From f6387ebc6589e54641faee7c82c2414cc2adda75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20Kr=C3=B6ker?= Date: Sat, 19 Nov 2022 22:16:42 +0100 Subject: [PATCH 1/4] init --- dictdatabase/byte_codes.py | 1 + dictdatabase/indexing.py | 40 +++++++++++++++++++++++++- dictdatabase/utils.py | 57 +++++++++++++++++++++++++++++++++++++- 3 files changed, 96 insertions(+), 2 deletions(-) diff --git a/dictdatabase/byte_codes.py b/dictdatabase/byte_codes.py index 5f50482..7d6383d 100644 --- a/dictdatabase/byte_codes.py +++ b/dictdatabase/byte_codes.py @@ -8,4 +8,5 @@ SPACE = 32 TAB = 9 NEWLINE = 10 +COLON = 58 COMMA = 44 diff --git a/dictdatabase/indexing.py b/dictdatabase/indexing.py index c5eaabc..760224a 100644 --- a/dictdatabase/indexing.py +++ b/dictdatabase/indexing.py @@ -1,6 +1,7 @@ +from dataclasses import dataclass import orjson import os -from . import config +from . import config, utils, byte_codes, io_bytes # Problem: Multiple read processes will concurrently read and write the same file # In some cases this will result in a empty read error, thats why the try-except exists @@ -21,6 +22,42 @@ # - Leave everything as is. While not ideal, it works. When empty read error occurs, don't use the index for that read + + + + +@dataclass +class KeyFinderState: + skip_next = False + in_str = False + list_depth = 0 + dict_depth = 1 + key_start = None + key_end = None + value_end = None + indices = [] + i = 1 + + +def batched_find_all_top_level_keys(db_name): + state, b = KeyFinderState(), 0 + while True: + batch_start = b * 10_000_000 + batch_end = batch_start + 10_000_000 + + batch_bytes = io_bytes.read_bytes(db_name, batch_start, batch_end) + + if batch_start == 0 and batch_bytes[0] != byte_codes.OPEN_CURLY: + raise ValueError("The first byte of the database file must be an opening curly brace") + if len(batch_bytes) == 0: + break + utils.find_all_top_level_keys(batch_bytes, state, len(batch_bytes)) + return state.indices + + + + + class Indexer: """ The Indexer takes the name of a database file, and tries to load the .index file @@ -57,6 +94,7 @@ def __init__(self, db_name: str): self.data = {} + def get(self, key): """ Returns a list of 5 elements for a key if it exists, otherwise None diff --git a/dictdatabase/utils.py b/dictdatabase/utils.py index 052c3cf..e1ecf27 100644 --- a/dictdatabase/utils.py +++ b/dictdatabase/utils.py @@ -1,8 +1,10 @@ from __future__ import annotations +from dataclasses import dataclass from typing import Tuple import os import glob from . import config, byte_codes +from . indexing import KeyFinderState def file_info(db_name: str) -> Tuple[str, bool, str, bool]: @@ -37,17 +39,70 @@ def find_all(file_name: str) -> list[str]: return files_all + +def find_all_top_level_keys(json_bytes: bytes, state: KeyFinderState, batch_size: int) -> KeyFinderState: + """ + In the bytes of the json object find all top level keys and the start and end + indices of their values. + """ + + while state.i < batch_size: + current = json_bytes[state.i] + if state.skip_next: + state.skip_next = False + elif current == byte_codes.BACKSLASH: + state.skip_next = True + elif current == byte_codes.QUOTE: + if state.dict_depth == 1 and state.list_depth == 0: + if state.in_str: + state.key_end = state.i + state.i += 1 + while json_bytes[state.i] in [byte_codes.SPACE, byte_codes.COLON]: + state.i += 1 + state.value_start = state.i + else: + state.key_start = state.i + 1 + state.in_str = not state.in_str + elif state.in_str or current in [byte_codes.SPACE, byte_codes.TAB, byte_codes.NEWLINE]: + pass + elif current == byte_codes.OPEN_SQUARE: + state.list_depth += 1 + elif current == byte_codes.CLOSE_SQUARE: + state.list_depth -= 1 + elif current == byte_codes.OPEN_CURLY: + state.dict_depth += 1 + elif current == byte_codes.CLOSE_CURLY: + state.dict_depth -= 1 + elif state.list_depth == 0 and state.dict_depth == 1: + state.indices.append((json_bytes[state.key_start:state.key_end].decode(), state.value_start, state.i + 1)) + state.i += 1 + + + + + + + def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int: """ Finds the index of the next comma or closing bracket/brace after the value of a key-value pair in a bytes object containing valid JSON when decoded. + Valid start indices are the index after the colon or the index after that. + + Example: + + 01234567 + "2": {}, + + Valid start indices are 4 and 5. Returns 7. + Args: - `json_bytes`: A bytes object containing valid JSON when decoded - `index`: The start index in json_bytes Returns: - - The end index of the value. + - The end index of the first byte right after the value's bytes. """ # See https://www.json.org/json-en.html for the JSON syntax From 11180c6f8c0937470bb59311bcd7cb31fd99af1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20Kr=C3=B6ker?= Date: Sun, 20 Nov 2022 10:45:58 +0100 Subject: [PATCH 2/4] wip --- dictdatabase/byte_codes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dictdatabase/byte_codes.py b/dictdatabase/byte_codes.py index 7d6383d..8204afa 100644 --- a/dictdatabase/byte_codes.py +++ b/dictdatabase/byte_codes.py @@ -10,3 +10,5 @@ NEWLINE = 10 COLON = 58 COMMA = 44 +DIGIT_0 = 48 +DIGIT_9 = 57 From 9d6c72b69747b829d1760120bd7809018477a74a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20Kr=C3=B6ker?= Date: Sun, 20 Nov 2022 14:42:35 +0100 Subject: [PATCH 3/4] unused byte codes --- dictdatabase/byte_codes.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/dictdatabase/byte_codes.py b/dictdatabase/byte_codes.py index 8204afa..7d6383d 100644 --- a/dictdatabase/byte_codes.py +++ b/dictdatabase/byte_codes.py @@ -10,5 +10,3 @@ NEWLINE = 10 COLON = 58 COMMA = 44 -DIGIT_0 = 48 -DIGIT_9 = 57 From 1dbe28509c4f824747f157d6f17e6e7cc8c1d7c7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 20 Nov 2022 13:43:35 +0000 Subject: [PATCH 4/4] Updated assets/coverage.svg --- assets/coverage.svg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/assets/coverage.svg b/assets/coverage.svg index 6bfc8fa..fe06143 100644 --- a/assets/coverage.svg +++ b/assets/coverage.svg @@ -9,13 +9,13 @@ - + coverage coverage - 99% - 99% + 94% + 94%