From 5d0cf228d6797dd157dd8e52ab58675dcd5c4ca3 Mon Sep 17 00:00:00 2001 From: Alexander Zhirov Date: Sat, 13 Sep 2025 03:15:22 +0300 Subject: [PATCH] =?UTF-8?q?=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=BB=D0=B5?= =?UTF-8?q?=D0=BD=20=D0=B0=D0=BD=D0=B3=D0=BB=D0=BE=D1=8F=D0=B7=D1=8B=D1=87?= =?UTF-8?q?=D0=BD=D1=8B=D0=B9=20DDoc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- source/cdcdb/core.d | 16 +++---- source/cdcdb/dblite.d | 8 ++-- source/cdcdb/snapshot.d | 74 +++++++++++++++++++++++++++-- source/cdcdb/storage.d | 102 ++++++++++++++++++++++++++++++++++++---- 4 files changed, 173 insertions(+), 27 deletions(-) diff --git a/source/cdcdb/core.d b/source/cdcdb/core.d index 874c23b..5ea8b4a 100644 --- a/source/cdcdb/core.d +++ b/source/cdcdb/core.d @@ -5,12 +5,12 @@ import std.digest.sha : SHA256, digest; struct Chunk { size_t index; // 1..N - size_t offset; // смещение в исходном буфере - size_t size; // размер чанка - immutable(ubyte)[32] sha256; // hex(SHA-256) содержимого + size_t offset; // offset in the source buffer + size_t size; // chunk size + immutable(ubyte)[32] sha256; // hex(SHA-256) of the content } -// Change Data Capture (Захват изменения данных) +// Change Data Capture (CDC) final class CDC { private: @@ -36,13 +36,13 @@ private: ulong fingerprint = 0; size_t index; - // инициализация без cut-check + // initialization without a cut-check while (index < _minSize) { fingerprint = (fingerprint << 1) + _gear[src[index]]; ++index; } - // строгая маска + // strict mask while (index < normalSize) { fingerprint = (fingerprint << 1) + _gear[src[index]]; @@ -50,7 +50,7 @@ private: return index; ++index; } - // слабая маска + // weak mask while (index < size) { fingerprint = (fingerprint << 1) + _gear[src[index]]; @@ -65,7 +65,7 @@ public: this(size_t minSize, size_t normalSize, size_t maxSize, ulong maskS, ulong maskL) @safe @nogc nothrow { assert(minSize > 0 && minSize < normalSize && normalSize < maxSize, - "Неверные размеры: требуется min < normal < max и min > 0"); + "Invalid sizes: require min < normal < max and min > 0"); _minSize = minSize; _normalSize = normalSize; _maxSize = maxSize; diff --git a/source/cdcdb/dblite.d b/source/cdcdb/dblite.d index 3127aaf..3e01338 100644 --- a/source/cdcdb/dblite.d +++ b/source/cdcdb/dblite.d @@ -87,7 +87,7 @@ private: if (msg.toLower.canFind("locked", "busy")) { if (--tryNo == 0) { throw new Exception( - "Не удалось выполнить подключение к базе данных после %d неудачных попыток: %s" + "Failed to connect to the database after %d failed attempts: %s" .format(_maxRetries, msg) ); } @@ -99,7 +99,7 @@ private: throw new Exception(msg); } - // Проверка БД на наличие существующих в ней необходимых таблиц + // Check that the database contains the required tables; otherwise create them void check() { SqliteResult queryResult = sql( @@ -123,7 +123,7 @@ private: } enforce(missingTables.length == 0 || missingTables.length == 3, - "База данных повреждена. Отсутствуют таблицы: " ~ missingTables.join(", ") + "Database is corrupted. 
Missing tables: " ~ missingTables.join(", ") ); if (missingTables.length == 3) @@ -221,7 +221,7 @@ public: ); if (queryResult.empty()) { - throw new Exception("Ошибка при добавлении нового снимока в базу данных"); + throw new Exception("Error adding a new snapshot to the database"); } return queryResult.front()["id"].to!long; diff --git a/source/cdcdb/snapshot.d b/source/cdcdb/snapshot.d index e06e8a5..49d6d33 100644 --- a/source/cdcdb/snapshot.d +++ b/source/cdcdb/snapshot.d @@ -8,6 +8,28 @@ import std.digest.sha : SHA256, digest; import std.datetime : DateTime; import std.exception : enforce; +/** + * Snapshot reader and lifecycle helper. + * + * This class reconstructs full file content from chunked storage persisted + * via `DBLite`, verifies integrity (per-chunk SHA-256 and final file hash), + * and provides a safe way to remove a snapshot record. + * + * Usage: + * --- + * auto s1 = new Snapshot(db, snapshotId); + * auto bytes = s1.data(); // materialize full content in memory + * + * // or stream into a sink to avoid large allocations: + * s1.data((const(ubyte)[] part) { + * // consume part + * }); + * --- + * + * Notes: + * - All integrity checks are enforced; any mismatch throws. + * - `data(void delegate(...))` is preferred for very large files. + */ final class Snapshot { private: @@ -19,32 +41,50 @@ private: ubyte[] bytes; if (chunk.zstd) { - enforce(chunk.zSize == chunk.content.length, "Размер сжатого фрагмента не соответствует ожидаемому"); + enforce(chunk.zSize == chunk.content.length, "Compressed chunk size does not match the expected value"); bytes = cast(ubyte[]) uncompress(chunk.content); } else { bytes = chunk.content.dup; } - enforce(chunk.size == bytes.length, "Оригинальный размер не соответствует ожидаемому"); - enforce(chunk.sha256 == digest!SHA256(bytes), "Хеш-сумма фрагмента не совпадает"); + enforce(chunk.size == bytes.length, "Original size does not match the expected value"); + enforce(chunk.sha256 == digest!SHA256(bytes), "Chunk hash does not match"); return bytes; } public: + /// Construct a `Snapshot` from an already fetched `DBSnapshot` row. + /// + /// Params: + /// dblite = database handle + /// dbSnapshot = snapshot row (metadata) previously retrieved this(DBLite dblite, DBSnapshot dbSnapshot) { _db = dblite; _snapshot = dbSnapshot; } + /// Construct a `Snapshot` by loading metadata from the database. + /// + /// Params: + /// dblite = database handle + /// idSnapshot = snapshot id to load this(DBLite dblite, long idSnapshot) { _db = dblite; _snapshot = _db.getSnapshot(idSnapshot); } + /// Materialize the full file content in memory. + /// + /// Reassembles all chunks in order, validates each chunk SHA-256 and the + /// final file SHA-256 (`snapshots.sha256`). + /// + /// Returns: full file content as a newly allocated `ubyte[]` + /// + /// Throws: Exception on any integrity check failure ubyte[] data() { auto chunks = _db.getChunks(_snapshot.id); @@ -60,11 +100,20 @@ public: fctx.put(bytes); } - enforce(_snapshot.sha256 == fctx.finish(), "Хеш-сумма файла не совпадает"); + enforce(_snapshot.sha256 == fctx.finish(), "File hash does not match"); return content; } + /// Stream the full file content into a caller-provided sink. + /// + /// This variant avoids allocating a single large buffer. Chunks are + /// decoded, verified, and passed to `sink` in order. 
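+    ///
+    /// A minimal sketch of streaming into a file; the `restored.bin` path and
+    /// the `std.stdio.File` sink are illustrative, not part of this API:
+    /// ---
+    /// import std.stdio : File;
+    ///
+    /// auto snap = new Snapshot(db, snapshotId);
+    /// auto output = File("restored.bin", "wb");
+    /// snap.data((const(ubyte)[] part) {
+    ///     output.rawWrite(part); // each part arrives already verified
+    /// });
+    /// ---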
+ /// + /// Params: + /// sink = delegate invoked for each verified chunk (may be called many times) + /// + /// Throws: Exception on any integrity check failure void data(void delegate(const(ubyte)[]) sink) { auto chunks = _db.getChunks(_snapshot.id); @@ -77,9 +126,17 @@ public: fctx.put(bytes); } - enforce(_snapshot.sha256 == fctx.finish(), "Хеш-сумма файла не совпадает"); + enforce(_snapshot.sha256 == fctx.finish(), "File hash does not match"); } + /// Remove this snapshot from the database inside a transaction. + /// + /// Starts an IMMEDIATE transaction, deletes the snapshot row, and commits. + /// On any failure it rolls back. + /// + /// Returns: `true` if the snapshot row was deleted, `false` otherwise + /// + /// Note: Does not garbage-collect unreferenced blobs; perform that separately. bool remove() { _db.beginImmediate(); @@ -103,31 +160,37 @@ public: return _snapshot.id == idDeleted; } + /// Snapshot id (primary key). @property long id() const nothrow @safe { return _snapshot.id; } + /// User-defined label. @property string label() const @safe { return _snapshot.label; } + /// Creation timestamp (UTC) from the database. @property DateTime created() const @safe { return _snapshot.createdUtc; } + /// Original file length in bytes. @property long length() const nothrow @safe { return _snapshot.sourceLength; } + /// Expected SHA-256 of the full file (32 raw bytes). @property ubyte[32] sha256() const nothrow @safe { return _snapshot.sha256; } + /// Snapshot status as a string (enum to string). @property string status() const { import std.conv : to; @@ -135,6 +198,7 @@ public: return _snapshot.status.to!string; } + /// Optional human-readable description. @property string description() const nothrow @safe { return _snapshot.description; diff --git a/source/cdcdb/storage.d b/source/cdcdb/storage.d index 6f4d64f..74df78d 100644 --- a/source/cdcdb/storage.d +++ b/source/cdcdb/storage.d @@ -6,14 +6,35 @@ import cdcdb.snapshot; import zstd : compress, Level; +/** + * High-level storage facade: splits data into CDC chunks, stores chunks/blobs + * into SQLite via `DBLite`, links them into snapshots, and returns `Snapshot` + * objects for retrieval and deletion. + * + * Features: + * - FastCDC-based content-defined chunking (configurable sizes/masks) + * - Optional Zstandard compression (level configurable) + * - Idempotent snapshot creation: skips if identical to the latest for label + * + * Typical usage: + * --- + * auto store = new Storage("cdc.sqlite", true, Level.default_); + * store.setupCDC(4096, 8192, 16384, 0x3FFF, 0x03FF); + * + * auto snap = store.newSnapshot("my.txt", data, "initial import"); + * auto bytes = snap.data(); // retrieve + * + * auto removed = store.removeSnapshots("my.txt"); // remove by label + * --- + */ final class Storage { private: - // Параметры работы с базой данных + // Database parameters DBLite _db; bool _zstd; int _level; - // Настройки CDC механизма + // CDC settings CDC _cdc; size_t _minSize; size_t _normalSize; @@ -29,11 +50,19 @@ private: _maxSize = maxSize; _maskS = maskS; _maskL = maskL; - // CDC не хранит динамически выделенных данных, переинициализация безопасна + // CDC holds no dynamically allocated state; reinitialization is safe _cdc = new CDC(_minSize, _normalSize, _maxSize, _maskS, _maskL); } public: + /// Construct the storage facade and open (or create) the database. 
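+    ///
+    /// A minimal sketch; the file name and tuning values are illustrative
+    /// (`Level` comes from the `zstd` package):
+    /// ---
+    /// // compressed blobs, wait up to 5 s on a busy database, retry 5 times
+    /// auto store = new Storage("cdc.sqlite", true, Level.base, 5000, 5);
+    /// ---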
+ /// + /// Params: + /// database = path to SQLite file + /// zstd = enable Zstandard compression for stored blobs + /// level = Zstd compression level (see `zstd.Level`) + /// busyTimeout = SQLite busy timeout in milliseconds + /// maxRetries = max retries on SQLITE_BUSY/LOCKED errors this(string database, bool zstd = false, int level = Level.base, size_t busyTimeout = 3000, size_t maxRetries = 3) { _db = new DBLite(database, busyTimeout, maxRetries); @@ -42,23 +71,44 @@ public: initCDC(); } + /// Reconfigure CDC parameters (takes effect for subsequent snapshots). + /// + /// Params: + /// minSize, normalSize, maxSize, maskS, maskL = FastCDC parameters void setupCDC(size_t minSize, size_t normalSize, size_t maxSize, size_t maskS, size_t maskL) { initCDC(minSize, normalSize, maxSize, maskS, maskL); } + /// Create a new snapshot from raw data. + /// + /// - Splits data with FastCDC using current settings. + /// - Optionally compresses chunks with Zstd. + /// - Stores unique blobs and links them to the created snapshot. + /// - If the latest snapshot for `label` already has the same file SHA-256, + /// returns `null` (idempotent). + /// + /// Params: + /// label = user-provided snapshot label (file identifier) + /// data = raw file bytes + /// description = optional human-readable description + /// + /// Returns: a `Snapshot` instance for the created snapshot, or `null` + /// + /// Throws: + /// Exception if `data` is empty or on database/storage errors Snapshot newSnapshot(string label, const(ubyte)[] data, string description = string.init) { if (data.length == 0) { - throw new Exception("Данные имеют нулевой размер"); + throw new Exception("Data has zero length"); } import std.digest.sha : SHA256, digest; ubyte[32] sha256 = digest!SHA256(data); - // Если последний снимок файла соответствует текущему состоянию + // If the last snapshot for the label matches current content if (_db.isLast(label, sha256)) return null; @@ -95,10 +145,10 @@ public: dbBlob.zstd = _zstd; - // Разбить на фрагменты + // Split into chunks Chunk[] chunks = _cdc.split(data); - // Запись фрагментов в БД + // Write chunks to DB foreach (chunk; chunks) { dbBlob.sha256 = chunk.sha256; @@ -118,7 +168,7 @@ public: dbBlob.content = content.dup; } - // Запись фрагментов + // Store/ensure blob _db.addBlob(dbBlob); dbSnapshotChunk.snapshotId = idSnapshot; @@ -126,7 +176,7 @@ public: dbSnapshotChunk.offset = chunk.offset; dbSnapshotChunk.sha256 = chunk.sha256; - // Привязка фрагментов к снимку + // Link chunk to snapshot _db.addSnapshotChunk(dbSnapshotChunk); } @@ -137,23 +187,52 @@ public: return snapshot; } - // Удаляет снимок по метке, возвращает количество удаленных снимков + /// Delete snapshots by label. + /// + /// Params: + /// label = snapshot label + /// + /// Returns: number of deleted snapshots long removeSnapshots(string label) { return _db.deleteSnapshot(label); } + /// Delete a specific snapshot instance. + /// + /// Params: + /// snapshot = `Snapshot` to remove + /// + /// Returns: `true` on success, `false` otherwise bool removeSnapshots(Snapshot snapshot) { return removeSnapshots(snapshot.id); } + /// Delete a snapshot by id. + /// + /// Params: + /// idSnapshot = snapshot id + /// + /// Returns: `true` if the row was deleted bool removeSnapshots(long idSnapshot) { return _db.deleteSnapshot(idSnapshot) == idSnapshot; } + /// Get a `Snapshot` object by id. 
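+    ///
+    /// A minimal sketch; the id value is illustrative:
+    /// ---
+    /// auto snap = store.getSnapshot(42);
+    /// auto when = snap.created;  // metadata is available immediately
+    /// auto bytes = snap.data();  // content is reassembled and verified on demand
+    /// ---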
+    ///
+    /// Params:
+    ///   idSnapshot = snapshot id
+    ///
+    /// Returns: `Snapshot` handle (metadata is loaded in the constructor; content is read on demand via `data()`)
     Snapshot getSnapshot(long idSnapshot) {
         return new Snapshot(_db, idSnapshot);
     }
 
+    /// List snapshots (optionally filtered by label).
+    ///
+    /// Params:
+    ///   label = filter by exact label; empty string returns all
+    ///
+    /// Returns: array of `Snapshot` handles
     Snapshot[] getSnapshots(string label = string.init) {
         Snapshot[] snapshots;
@@ -164,6 +243,9 @@ public:
         return snapshots;
     }
 
+    /// Library version string.
+    ///
+    /// Returns: semantic version of the `cdcdb` library
     string getVersion() const @safe nothrow {
         import cdcdb.version_ : cdcdbVersion;