From 6a2df148ecc640c759b66558c657c921dfa6661f Mon Sep 17 00:00:00 2001 From: Jakob Scheid Date: Sat, 28 Feb 2026 02:18:03 +0100 Subject: [PATCH] Removed all the functions and classes that are now in jeb-server-utils, updated index URL --- README.md | 5 +- deploy.sh | 2 +- pyproject.toml | 2 +- src/jeb_utils/jeb_utils.py | 454 ------------------------------------- 4 files changed, 6 insertions(+), 457 deletions(-) diff --git a/README.md b/README.md index 45de270..781df25 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Common utils for JEB client and server. You can install the library using `pip`: ```bash -pip install jeb-utils --index-url https://jcloud-services.ddns.net/simple/ --extra-index-url https://pypi.org/simple/ +pip install jeb-utils --index-url https://repo.jcloud-services.ddns.net/simple/ --extra-index-url https://pypi.org/simple/ ``` ## Functions and classes @@ -77,6 +77,9 @@ pip install jeb-utils --index-url https://jcloud-services.ddns.net/simple/ --ext - `create_file_if_not_exists`: Creates a file if it does not exist. ## Changelog +### Version 0.2.0 +- Removed all the functions and classes that are now in jeb-server-utils. + ### Version 0.1.4 - Bug fix: Due to a refactoring, `jeb_utils.jebp_utils.unpack_fields` did not work. diff --git a/deploy.sh b/deploy.sh index 688313e..36fb1d6 100755 --- a/deploy.sh +++ b/deploy.sh @@ -15,4 +15,4 @@ # limitations under the License. python3 -m build -scp dist/* jcloud@jcloud-services.ddns.net:/srv/data/jcloud/htdocs/simple/config-parser \ No newline at end of file +scp dist/* jcloud@jcloud-services.ddns.net:/srv/data/wwwstatic/repo/simple/config-parser \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index a8b0397..dcb16f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "jeb-utils" -version = "0.1.4" +version = "0.2.0" description = "Common utils for JEB client and server." dependencies = ["cryptography"] license = "Apache-2.0" \ No newline at end of file diff --git a/src/jeb_utils/jeb_utils.py b/src/jeb_utils/jeb_utils.py index 392d488..4dc5804 100644 --- a/src/jeb_utils/jeb_utils.py +++ b/src/jeb_utils/jeb_utils.py @@ -34,28 +34,13 @@ __all__ = [ 'COMPRESSION_TYPE_ZSTD', 'FETCH_START_TYPE_OFFSET', 'FETCH_START_TYPE_TIMESTAMP', - 'FETCH_RECORDS_CHUNK_SIZE', - 'SEGMENT_MAX_SIZE', 'format_topic_partitions', - 'mktopic', - 'rmtopic', - 'mktopicpart', - 'rmtopicpart', - 'get_topics', - 'get_segment_base_timestamp', - 'get_log_end_offsets', - 'init', 'get_next_lower_index_entry', 'validate_topic_partition', 'validate_topic_name', 'validate_topic_format', - 'check_topic_exists', 'pack_record_headers', 'unpack_record_headers', - 'create_record', - 'fetch_records', - 'Topic', - 'Segment', 'FileCorruptWarning', 'AttributesByte' ] @@ -69,11 +54,6 @@ COMPRESSION_TYPE_LZ4 = 3 COMPRESSION_TYPE_ZSTD = 4 FETCH_START_TYPE_OFFSET = 0xc0 FETCH_START_TYPE_TIMESTAMP = 0xc1 -FETCH_RECORDS_CHUNK_SIZE = 1024 ** 2 # 1 MiB -SEGMENT_MAX_SIZE = None -DATA_DIR = None - -log_end_offsets = {} def format_topic_partitions(topic_name: str, partitions: set) -> list: ''' @@ -89,180 +69,6 @@ def format_topic_partitions(topic_name: str, partitions: set) -> list: ''' return [f'{topic_name}{TOPIC_PARTITION_SEPARATOR}{p:02d}' for p in partitions] -def mktopic(topic_name: str): - ''' - Creates a topic. - - :param topic_name: The name of the topic - :type topic_name: str - ''' - with dbm.open(f'{DATA_DIR}/conf/topics', 'c') as db: - if topic_name.encode() in db.keys(): - raise exceptions.TopicExistsError(f'topic \'{topic_name}\' already exists') - db[topic_name] = pickle.dumps({'partitions': {1}}) - db.close() - - os.mkdir(f'{DATA_DIR}/topics/{topic_name}{TOPIC_PARTITION_SEPARATOR}01/') - -def rmtopic(topic_name: str): - ''' - Removes a topic. - - :param topic_name: The topic name - :type topic_name: str - - :raises TopicNotFoundError: If the topic does not exist. - ''' - with dbm.open(f'{DATA_DIR}/conf/topics', 'c') as db: - if topic_name.encode() not in db.keys(): - raise exceptions.TopicNotFoundError(f'topic \'{topic_name}\' does not exist') - partitions = pickle.loads(db[topic_name.encode()])['partitions'] - del db[topic_name] - db.close() - - partition_dirs = format_topic_partitions(topic_name, partitions) - for pd in partition_dirs: - try: - shutil.rmtree(f'{DATA_DIR}/topics/{pd}/') - except: - pass - -def mktopicpart(topic_name: str, partition_number: int): - ''' - Creates a partition of a topic. - - :param topic_name: The topic name - :type topic_name: str - :param partition_number: The partition number - :type partition_number: int - ''' - if partition_number < 1 or partition_number > 99: - raise ValueError('partition number must be between 1 and 99') - formatted_partition_number = f'{(int(partition_number)):02d}' - with dbm.open(f'{DATA_DIR}/conf/topics', 'c') as db: - if topic_name.encode() not in db.keys(): - raise exceptions.TopicNotFoundError(f'topic \'{topic_name}\' does not exist') - partitions = pickle.loads(db[topic_name.encode()])['partitions'] - if partition_number in partitions: - raise exceptions.TopicPartitionExistsError(f'partition \'{formatted_partition_number}\' already exists for topic \'{topic_name}\'') - db[topic_name] = pickle.dumps({'partitions': partitions | {int(formatted_partition_number)}}) - db.close() - - os.mkdir(f'{DATA_DIR}/topics/{topic_name}{TOPIC_PARTITION_SEPARATOR}{formatted_partition_number}/') - -def rmtopicpart(topic_name: str, partition_number: int): - ''' - Removes a partition of a topic. - - :param topic_name: The topic name - :type topic_name: str - :param partition_number: The partition number - :type partition_number: int - ''' - formatted_partition_number = f'{(int(partition_number)):02d}' - with dbm.open(f'{DATA_DIR}/conf/topics', 'c') as db: - if topic_name.encode() not in db.keys(): - raise exceptions.TopicNotFoundError(f'topic \'{topic_name}\' does not exist') - partitions = pickle.loads(db[topic_name.encode()])['partitions'] - if partition_number not in partitions: - raise exceptions.TopicPartitionNotFoundError(f'partition \'{formatted_partition_number}\' does not exist for topic \'{topic_name}\'') - db[topic_name] = pickle.dumps({'partitions': partitions - {int(formatted_partition_number)}}) - db.close() - - shutil.rmtree(f'{DATA_DIR}/topics/{topic_name}{TOPIC_PARTITION_SEPARATOR}{formatted_partition_number}/') - -def get_topics(): - ''' - Returns all topics. - - :return: The topics - :rtype: dict - ''' - with dbm.open(f'{DATA_DIR}/conf/topics', 'c') as db: - topics = {k.decode(): pickle.loads(v) for k, v in db.items()} - db.close() - return topics - -def _create_base_segment_files(topics: dict): - for topic, v in topics.items(): - partitions = format_topic_partitions(topic, v['partitions']) - for p in partitions: - utils.create_file_if_not_exists(f'{DATA_DIR}/topics/{p}/' + '0' * 16 + '.log') - utils.create_file_if_not_exists(f'{DATA_DIR}/topics/{p}/' + '0' * 16 + '.index') - utils.create_file_if_not_exists(f'{DATA_DIR}/topics/{p}/' + '0' * 16 + '.timeindex') - -def get_segment_base_timestamp(topic: str, segment: int) -> int: - ''' - Returns the base timestamp of a segment. - - :param topic: The topic - :type topic: str - :param segment: The segment base offset - :type segment: int - - :return: The base timestamp (milliseconds since 1970) - :rtype: int - ''' - with open(f'{DATA_DIR}/topics/{topic}/{segment:016x}.log', 'rb') as logfile: - # go to the topic length - logfile.seek(20) - - # skip topic - topic_length = int.from_bytes(logfile.read(2)) - logfile.seek(topic_length + 1, os.SEEK_CUR) - - timestamp_bytes = logfile.read(8) - if len(timestamp_bytes) < 8: - timestamp = None - else: - timestamp = int.from_bytes(timestamp_bytes) - - return timestamp - -def get_log_end_offsets(topics: dict): - _topics = topics - topics = [f'{t}{TOPIC_PARTITION_SEPARATOR}{p:02d}' for t, v in _topics.items() for p in v['partitions']] - last_topics_segments = {t: sorted([s for s in os.listdir(f'{DATA_DIR}/topics/{t}') if s.endswith('.log') and utils.is_number(s.split('.')[0], 16) and len(s.split('.')[0]) == 16])[-1] for t in topics} - topic_segments = {t: Topic(t).segments for t in topics} - for topic, segments in topic_segments.items(): - log_end_offset = 0 - for segment in segments: - with open(f'{DATA_DIR}/topics/{topic}/{segment.base_offset:016x}.log', 'rb') as logfile: - logfile.seek(0, os.SEEK_END) - file_size = logfile.tell() - logfile.seek(0) - - while True: - record_size_bytes = logfile.read(8) - if not record_size_bytes or len(record_size_bytes) < 8: - break - record_size = int.from_bytes(record_size_bytes) - if logfile.tell() + record_size > file_size: - break - logfile.seek(record_size, os.SEEK_CUR) - log_end_offset += 1 - logfile.close() - log_end_offsets[topic] = log_end_offset - -def init(config: dict = {'segment_max_size': 1024 * 256, 'data_dir': './data'}): - ''' - Initializes the module. Calling this function is required before using most of the functions of the module. - - :param config: The configuration - :type config: dict - ''' - global SEGMENT_MAX_SIZE, DATA_DIR - SEGMENT_MAX_SIZE = config.get('segment_max_size', SEGMENT_MAX_SIZE) - DATA_DIR = config.get('data_dir', DATA_DIR) - - topics = get_topics() - # segments = {t: sorted([int(s.split('.')[0], 16) for s in os.listdir(f'{DATA_DIR}/topics/{t}') if s.endswith('.log') and utils.is_number(s.split('.')[0], 16) and len(s.split('.')[0]) == 16]) for t in [f'{t}{TOPIC_PARTITION_SEPARATOR}{p:02d}' for t, v in topics.items() for p in v['partitions']]} - - # base_timestamps = {t: {s: Segment(t, s).base_timestamp for s in ss} for t, ss in segments.items()} - - _create_base_segment_files(topics) - get_log_end_offsets(topics) - def get_next_lower_index_entry(offset: int): return utils.get_next_lower_integer_multiple(offset, 1024) @@ -320,24 +126,6 @@ def validate_topic_format(topic: str): if len(formatted_partition_number) != 2 or not formatted_partition_number.isdigit(): raise ValueError('invalid partition number format') -def check_topic_exists(topic: str): - ''' - Checks whether a topic exists. - - :param topic: The topic name - :type topic: str - - :raises TopicNotFoundError: If the topic does not exist - :raises TopicPartitionNotFoundError: If the partition does not exist - ''' - topics = get_topics() - topic_name, formatted_partition_number = topic.split(TOPIC_PARTITION_SEPARATOR) - raw_partition_number = int(formatted_partition_number) - if topic_name not in topics: - raise exceptions.TopicNotFoundError(f'topic \'{topic_name}\' does not exist') - if raw_partition_number not in topics[topic_name]['partitions']: - raise exceptions.TopicPartitionNotFoundError(f'partition \'{formatted_partition_number}\' does not exist for topic \'{topic_name}\'') - def pack_record_headers(headers: dict) -> bytes: ''' Packs the record header dictionary into bytes. @@ -394,248 +182,6 @@ def unpack_record_headers(packed: bytes) -> dict: buffer = io.BytesIO(packed) return _unpack_record_headers(buffer) -def create_record(topic: str, timestamp: int, record_data: bytes, key: bytes = b'', compression_type: int = 0, headers: dict = {}): - ''' - Creates a record. - - :param topic: The topic name - :type topic: str - :param timestamp: The timestamp. - :type timestamp: int - :param record_data: The payload - :type record_data: bytes - :param key: The key - :type key: bytes - :param compression_type: The compression type - :type compression_type: int - :param headers: The header dictionary - :type headers: dict - ''' - global log_end_offsets - validate_topic_format(topic) - topic_name, formatted_partition_number = topic.split(TOPIC_PARTITION_SEPARATOR) - - check_topic_exists(topic) - - topic_dir = f'{DATA_DIR}/topics/{topic_name}{TOPIC_PARTITION_SEPARATOR}{formatted_partition_number}/' - utils.create_file_if_not_exists(topic_dir + '0' * 16 + '.log') - segments = sorted([int(f.split('.')[0], 16) for f in os.listdir(topic_dir) if f.endswith('.log') and utils.is_number(f.split('.')[0], 16) and len(f.split('.')[0]) == 16]) - last_segment = max(segments) - - if timestamp == 0: - timestamp = int(time.time() * 1000) - timestamp_type = 1 - else: - timestamp_type = 0 - - # Assemble record - - record = b'' - record += log_end_offsets[topic].to_bytes(8) # Offset - record += binascii.crc32(record_data).to_bytes(4) # CRC - record += (len(topic)).to_bytes(2) # Topic name length - record += topic.encode() # Topic - record += AttributesByte(compression_type, timestamp_type).to_byte() # Attributes - record += timestamp.to_bytes(8) # Timestamp - record += (len(key)).to_bytes(4) # Key Length - record += key # Key - record += (len(record_data)).to_bytes(4) # Content Length - record += record_data # Content - record += (len(headers)).to_bytes(1) - for key, value in headers.items(): - record += len(key).to_bytes(4) - record += key - record += len(value).to_bytes(4) - record += value - - - record = len(record).to_bytes(8) + record # Prepend Record Size - - if os.path.getsize(topic_dir + f'{last_segment:016x}.log') + len(record) > SEGMENT_MAX_SIZE: - last_segment = log_end_offsets[topic] - - with open(topic_dir + f'{last_segment:016x}.log', 'ab') as logfile: - logfile.write(record) - logfile.close() - - utils.create_file_if_not_exists(topic_dir + f'{last_segment:016x}.index') - utils.create_file_if_not_exists(topic_dir + f'{last_segment:016x}.timeindex') - - if log_end_offsets[topic] % 1024 == 0: - index = log_end_offsets[topic].to_bytes(8) + os.path.getsize(topic_dir + f'{last_segment:016x}.log').to_bytes(4) - timeindex = timestamp.to_bytes(8) + log_end_offsets[topic].to_bytes(8) - with open(topic_dir + f'{last_segment:016x}.index', 'ab') as idxfile: - idxfile.write(index) - idxfile.close() - with open(topic_dir + f'{last_segment:016x}.timeindex', 'ab') as timeidxfile: - timeidxfile.write(timeindex) - timeidxfile.close() - - log_end_offsets[topic] += 1 - -async def fetch_records(topic: str, start_type: int, start: int, max_bytes: int, send_block_function: FunctionType | MethodType): - ''' - Fetches records from the topic and sends it using ``send_block_function`` - - :param topic: The topic name - :type topic: str - :param start_type: The start type - :type start_type: int - :param start: The start - :type start: int - :param max_bytes: The maximum of bytes - :type max_bytes: int - :param send_block_function: The function to send the chunks - :type send_block_function: FunctionType | MethodType - ''' - topic = Topic(topic) - topics = get_topics() - validate_topic_format(topic.topic_name) - topic_name, formatted_partition_number = topic.topic_name.split(TOPIC_PARTITION_SEPARATOR) - raw_partition_number = int(formatted_partition_number) - if start_type not in (FETCH_START_TYPE_OFFSET, FETCH_START_TYPE_TIMESTAMP): - raise ValueError('invalid start type') - check_topic_exists(topic.topic_name) - segments = [int(f.split('.')[0], 16) for f in os.listdir(f'{DATA_DIR}/topics/{topic.topic_name}/') if utils.is_number(f.split('.')[0], 16) and len(f.split('.')[0]) == 16 and f.endswith('.log')] - - if start_type == FETCH_START_TYPE_TIMESTAMP: - segment_base_timestamps = {s: s.base_timestamp for s in topic.segments} - s = {v: k for k, v in segment_base_timestamps.items()}[utils.find_nearest_lower_number(list(segment_base_timestamps.values()), start) or min(segment_base_timestamps.values())] - with open(f'{DATA_DIR}/topics/{topic.topic_name}/{s.base_offset:016x}.timeindex', 'rb') as f: - timestamps = {} - f.seek(0) - while True: - timestamp_bytes = f.read(8) - if len(timestamp_bytes) < 8: - break - timestamp = int.from_bytes(timestamp_bytes) - - offset_bytes = f.read(8) - if len(offset_bytes) < 8: - break - offset = int.from_bytes(offset_bytes) - - timestamps[timestamp] = offset - f.close() - start = timestamps[utils.find_nearest_lower_number(timestamps.keys(), start) or min(timestamps.keys())] - - - if start < 0 or start >= log_end_offsets[topic.topic_name]: - await send_block_function(b'\xb1\x31') - return - idx_offset = get_next_lower_index_entry(start) - - - start_seg = utils.find_nearest_lower_number(segments, start) - - - with open(f'{DATA_DIR}/topics/{topic.topic_name}/' + f'{start_seg:016x}' + '.index', 'rb') as idxfile: - idxfile.seek(0, os.SEEK_END) - file_size = idxfile.tell() - idxfile.seek(idx_offset // 1024 * 12) - index_entry = idxfile.read(12) - idx_entry_pos = int.from_bytes(index_entry[8:]) - if int.from_bytes(index_entry[:8]) != idx_offset: - warnings.warn(f'Index entry offset mismatch: expected {idx_offset}, got {int.from_bytes(index_entry[:8])}', FileCorruptWarning) - idxfile.close() - - with open(f'{DATA_DIR}/topics/{topic.topic_name}/{start_seg:016x}.log', 'rb') as logfile: - logfile.seek(idx_entry_pos) - offset = idx_offset - record_position = 0 - while offset < start: - record_length_bytes = logfile.read(8) - if len(record_length_bytes) < 8: - break - record_length = int.from_bytes(record_length_bytes) - - offset_bytes = logfile.read(8) - if len(offset_bytes) < 8: - break - offset = int.from_bytes(offset_bytes) - - record_position = logfile.tell() - 16 - logfile.seek(record_length - 8, os.SEEK_CUR) - logfile.close() - - - - await send_block_function(b'\xa0\x30') - - segments = sorted(segments) - - bytes_fetched = 0 - print(max_bytes) - for i, segment in enumerate(segments[segments.index(start_seg):]): - records_sent = 0 - with open(f'{DATA_DIR}/topics/{topic.topic_name}/' + f'{segment:016x}' + '.log', 'rb') as logfile: - if i == 0: - logfile.seek(record_position) - else: - logfile.seek(0) - while bytes_fetched <= max_bytes: - record_size_bytes = logfile.read(8) - if len(record_size_bytes) < 8: - break - record_size = int.from_bytes(record_size_bytes) - if bytes_fetched + 8 + record_size > max_bytes: - break - - to_send = record_size_bytes - - for _ in range(record_size // FETCH_RECORDS_CHUNK_SIZE): - to_send += logfile.read(FETCH_RECORDS_CHUNK_SIZE) - await send_block_function(b'\x01' + to_send) - to_send = b'' - to_send += logfile.read(record_size % FETCH_RECORDS_CHUNK_SIZE) - await send_block_function(b'\x01' + to_send) - - bytes_fetched += 8 + record_size - records_sent += 1 - - logfile.close() - - - - if bytes_fetched >= max_bytes: - break - - - - - - await send_block_function(b'\x00') # EOF - -class Topic: - def __init__(self, topic_name: str): - self.topic_name = topic_name - - def __repr__(self): - return f'Topic({self.topic_name})' - - @property - def segments(self): - return [Segment(self.topic_name, bo) for bo in sorted([int(s.split('.')[0], 16) for s in os.listdir(f'{DATA_DIR}/topics/{self.topic_name}') if s.endswith('.log') and utils.is_number(s.split('.')[0], 16) and len(s.split('.')[0]) == 16])] - -class Segment: - def __init__(self, topic: str, base_offset: int): - self.topic = topic - self.base_offset = base_offset - self.base_timestamp = self.get_base_timestamp() - - def __repr__(self): - return f'Segment({self.topic}, {self.base_offset})' - - def get_base_timestamp(self): - with open(f'{DATA_DIR}/topics/{self.topic}/{self.base_offset:016x}.timeindex', 'rb') as logfile: - timestamp_bytes = logfile.read(8) - if len(timestamp_bytes) < 8: - timestamp = None - else: - timestamp = int.from_bytes(timestamp_bytes) - - return timestamp - class FileCorruptWarning(UserWarning): ... class AttributesByte: