diff --git a/README.md b/README.md index 4fcde34..9ac8a90 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,9 @@ pip install jeb-utils --index-url https://jcloud-services.ddns.net/simple/ --ext - `create_file_if_not_exists`: Creates a file if it does not exist. ## Changelog +### Version 0.1.1 +- Support for custom data paths (`jeb_utils.init`) + ### Version 0.1.0 - First release - `auth_utils` diff --git a/pyproject.toml b/pyproject.toml index 4bf604c..5628f88 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,6 @@ build-backend = "setuptools.build_meta" [project] name = "jeb-utils" -version = "0.1.0" +version = "0.1.1" description = "Common utils for JEB client and server." dependencies = ["cryptography"] \ No newline at end of file diff --git a/src/jeb_utils/jeb_utils.py b/src/jeb_utils/jeb_utils.py index 756a931..06b4aee 100644 --- a/src/jeb_utils/jeb_utils.py +++ b/src/jeb_utils/jeb_utils.py @@ -57,6 +57,7 @@ FETCH_START_TYPE_OFFSET = 0xc0 FETCH_START_TYPE_TIMESTAMP = 0xc1 FETCH_RECORDS_CHUNK_SIZE = 1024 ** 2 # 1 MiB SEGMENT_MAX_SIZE = None +DATA_DIR = None log_end_offsets = {} @@ -81,13 +82,13 @@ def mktopic(topic_name: str): :param topic_name: The name of the topic :type topic_name: str ''' - with dbm.open('data/conf/topics', 'c') as db: + with dbm.open(f'{DATA_DIR}/conf/topics', 'c') as db: if topic_name.encode() in db.keys(): raise exceptions.TopicExistsError(f'topic \'{topic_name}\' already exists') db[topic_name] = pickle.dumps({'partitions': {1}}) db.close() - os.mkdir(f'data/topics/{topic_name}{TOPIC_PARTITION_SEPARATOR}01/') + os.mkdir(f'{DATA_DIR}/topics/{topic_name}{TOPIC_PARTITION_SEPARATOR}01/') def rmtopic(topic_name: str): ''' @@ -98,7 +99,7 @@ def rmtopic(topic_name: str): :raises TopicNotFoundError: If the topic does not exist. 
''' - with dbm.open('data/conf/topics', 'c') as db: + with dbm.open(f'{DATA_DIR}/conf/topics', 'c') as db: if topic_name.encode() not in db.keys(): raise exceptions.TopicNotFoundError(f'topic \'{topic_name}\' does not exist') partitions = pickle.loads(db[topic_name.encode()])['partitions'] @@ -108,7 +109,7 @@ def rmtopic(topic_name: str): partition_dirs = format_topic_partitions(topic_name, partitions) for pd in partition_dirs: try: - shutil.rmtree(f'data/topics/{pd}/') + shutil.rmtree(f'{DATA_DIR}/topics/{pd}/') except: pass @@ -124,7 +125,7 @@ def mktopicpart(topic_name: str, partition_number: int): if partition_number < 1 or partition_number > 99: raise ValueError('partition number must be between 1 and 99') formatted_partition_number = f'{(int(partition_number)):02d}' - with dbm.open('data/conf/topics', 'c') as db: + with dbm.open(f'{DATA_DIR}/conf/topics', 'c') as db: if topic_name.encode() not in db.keys(): raise exceptions.exceptions.TopicNotFoundError(f'topic \'{topic_name}\' does not exist') partitions = pickle.loads(db[topic_name.encode()])['partitions'] @@ -133,7 +134,7 @@ def mktopicpart(topic_name: str, partition_number: int): db[topic_name] = pickle.dumps({'partitions': partitions | {int(formatted_partition_number)}}) db.close() - os.mkdir(f'data/topics/{topic_name}{TOPIC_PARTITION_SEPARATOR}{formatted_partition_number}/') + os.mkdir(f'{DATA_DIR}/topics/{topic_name}{TOPIC_PARTITION_SEPARATOR}{formatted_partition_number}/') def rmtopicpart(topic_name: str, partition_number: int): ''' @@ -145,7 +146,7 @@ def rmtopicpart(topic_name: str, partition_number: int): :type partition_number: int ''' formatted_partition_number = f'{(int(partition_number)):02d}' - with dbm.open('data/conf/topics', 'c') as db: + with dbm.open(f'{DATA_DIR}/conf/topics', 'c') as db: if topic_name.encode() not in db.keys(): raise exceptions.TopicNotFoundError(f'topic \'{topic_name}\' does not exist') partitions = pickle.loads(db[topic_name.encode()])['partitions'] @@ -154,7 +155,7 @@ 
def rmtopicpart(topic_name: str, partition_number: int): db[topic_name] = pickle.dumps({'partitions': partitions - {int(formatted_partition_number)}}) db.close() - shutil.rmtree(f'data/topics/{topic_name}{TOPIC_PARTITION_SEPARATOR}{formatted_partition_number}/') + shutil.rmtree(f'{DATA_DIR}/topics/{topic_name}{TOPIC_PARTITION_SEPARATOR}{formatted_partition_number}/') def get_topics(): ''' @@ -163,7 +164,7 @@ def get_topics(): :return: The topics :rtype: dict ''' - with dbm.open('data/conf/topics', 'c') as db: + with dbm.open(f'{DATA_DIR}/conf/topics', 'c') as db: topics = {k.decode(): pickle.loads(v) for k, v in db.items()} db.close() return topics @@ -172,9 +173,9 @@ def _create_base_segment_files(topics: dict): for topic, v in topics.items(): partitions = format_topic_partitions(topic, v['partitions']) for p in partitions: - utils.create_file_if_not_exists(f'data/topics/{p}/' + '0' * 16 + '.log') - utils.create_file_if_not_exists(f'data/topics/{p}/' + '0' * 16 + '.index') - utils.create_file_if_not_exists(f'data/topics/{p}/' + '0' * 16 + '.timeindex') + utils.create_file_if_not_exists(f'{DATA_DIR}/topics/{p}/' + '0' * 16 + '.log') + utils.create_file_if_not_exists(f'{DATA_DIR}/topics/{p}/' + '0' * 16 + '.index') + utils.create_file_if_not_exists(f'{DATA_DIR}/topics/{p}/' + '0' * 16 + '.timeindex') def get_segment_base_timestamp(topic: str, segment: int) -> int: ''' @@ -188,7 +189,7 @@ def get_segment_base_timestamp(topic: str, segment: int) -> int: :return: The base timestamp (milliseconds since 1970) :rtype: int ''' - with open(f'data/topics/{topic}/{segment:016x}.log', 'rb') as logfile: + with open(f'{DATA_DIR}/topics/{topic}/{segment:016x}.log', 'rb') as logfile: # go to the topic length logfile.seek(20) @@ -207,12 +208,12 @@ def get_segment_base_timestamp(topic: str, segment: int) -> int: def get_log_end_offsets(topics: dict): _topics = topics topics = [f'{t}{TOPIC_PARTITION_SEPARATOR}{p:02d}' for t, v in _topics.items() for p in v['partitions']] - 
last_topics_segments = {t: sorted([s for s in os.listdir(f'data/topics/{t}') if s.endswith('.log') and utils.is_number(s.split('.')[0], 16) and len(s.split('.')[0]) == 16])[-1] for t in topics} + last_topics_segments = {t: sorted([s for s in os.listdir(f'{DATA_DIR}/topics/{t}') if s.endswith('.log') and utils.is_number(s.split('.')[0], 16) and len(s.split('.')[0]) == 16])[-1] for t in topics} topic_segments = {t: Topic(t).segments for t in topics} for topic, segments in topic_segments.items(): log_end_offset = 0 for segment in segments: - with open(f'data/topics/{topic}/{segment.base_offset:016x}.log', 'rb') as logfile: + with open(f'{DATA_DIR}/topics/{topic}/{segment.base_offset:016x}.log', 'rb') as logfile: logfile.seek(0, os.SEEK_END) file_size = logfile.tell() logfile.seek(0) @@ -229,18 +230,19 @@ def get_log_end_offsets(topics: dict): logfile.close() log_end_offsets[topic] = log_end_offset -def init(config: dict = {'segment_max_size': 1024 * 256}): +def init(config: dict = {'segment_max_size': 1024 * 256, 'data_dir': './data'}): ''' Initializes the module. Calling this function is required before using most of the functions of the module. 
:param config: The configuration :type config: dict ''' - global SEGMENT_MAX_SIZE + global SEGMENT_MAX_SIZE, DATA_DIR SEGMENT_MAX_SIZE = config.get('segment_max_size', SEGMENT_MAX_SIZE) + DATA_DIR = config.get('data_dir', './data') topics = get_topics() - # segments = {t: sorted([int(s.split('.')[0], 16) for s in os.listdir(f'data/topics/{t}') if s.endswith('.log') and utils.is_number(s.split('.')[0], 16) and len(s.split('.')[0]) == 16]) for t in [f'{t}{TOPIC_PARTITION_SEPARATOR}{p:02d}' for t, v in topics.items() for p in v['partitions']]} + # segments = {t: sorted([int(s.split('.')[0], 16) for s in os.listdir(f'{DATA_DIR}/topics/{t}') if s.endswith('.log') and utils.is_number(s.split('.')[0], 16) and len(s.split('.')[0]) == 16]) for t in [f'{t}{TOPIC_PARTITION_SEPARATOR}{p:02d}' for t, v in topics.items() for p in v['partitions']]} # base_timestamps = {t: {s: Segment(t, s).base_timestamp for s in ss} for t, ss in segments.items()} @@ -399,7 +401,7 @@ def create_record(topic: str, timestamp: int, record_data: bytes, key: bytes = b check_topic_exists(topic) - topic_dir = f'data/topics/{topic_name}{TOPIC_PARTITION_SEPARATOR}{formatted_partition_number}/' + topic_dir = f'{DATA_DIR}/topics/{topic_name}{TOPIC_PARTITION_SEPARATOR}{formatted_partition_number}/' utils.create_file_if_not_exists(topic_dir + '0' * 16 + '.log') segments = sorted([int(f.split('.')[0], 16) for f in os.listdir(topic_dir) if f.endswith('.log') and utils.is_number(f.split('.')[0], 16) and len(f.split('.')[0]) == 16]) last_segment = max(segments) @@ -478,12 +480,12 @@ async def fetch_records(topic: str, start_type: int, start: int, max_bytes: int, if start_type not in (FETCH_START_TYPE_OFFSET, FETCH_START_TYPE_TIMESTAMP): raise ValueError('invalid start type') check_topic_exists(topic.topic_name) - segments = [int(f.split('.')[0], 16) for f in os.listdir(f'data/topics/{topic.topic_name}/') if utils.is_number(f.split('.')[0], 16) and len(f.split('.')[0]) == 16 and f.endswith('.log')] + segments =
[int(f.split('.')[0], 16) for f in os.listdir(f'{DATA_DIR}/topics/{topic.topic_name}/') if utils.is_number(f.split('.')[0], 16) and len(f.split('.')[0]) == 16 and f.endswith('.log')] if start_type == FETCH_START_TYPE_TIMESTAMP: segment_base_timestamps = {s: s.base_timestamp for s in topic.segments} s = {v: k for k, v in segment_base_timestamps.items()}[utils.find_nearest_lower_number(list(segment_base_timestamps.values()), start) or min(segment_base_timestamps.values())] - with open(f'data/topics/{topic.topic_name}/{s.base_offset:016x}.timeindex', 'rb') as f: + with open(f'{DATA_DIR}/topics/{topic.topic_name}/{s.base_offset:016x}.timeindex', 'rb') as f: timestamps = {} f.seek(0) while True: @@ -511,7 +513,7 @@ async def fetch_records(topic: str, start_type: int, start: int, max_bytes: int, start_seg = utils.find_nearest_lower_number(segments, start) - with open(f'data/topics/{topic.topic_name}/' + f'{start_seg:016x}' + '.index', 'rb') as idxfile: + with open(f'{DATA_DIR}/topics/{topic.topic_name}/' + f'{start_seg:016x}' + '.index', 'rb') as idxfile: idxfile.seek(0, os.SEEK_END) file_size = idxfile.tell() idxfile.seek(idx_offset // 1024 * 12) @@ -521,7 +523,7 @@ async def fetch_records(topic: str, start_type: int, start: int, max_bytes: int, warnings.warn(f'Index entry offset mismatch: expected {idx_offset}, got {int.from_bytes(index_entry[:8])}', FileCorruptWarning) idxfile.close() - with open(f'data/topics/{topic.topic_name}/{start_seg:016x}.log', 'rb') as logfile: + with open(f'{DATA_DIR}/topics/{topic.topic_name}/{start_seg:016x}.log', 'rb') as logfile: logfile.seek(idx_entry_pos) offset = idx_offset record_position = 0 @@ -550,7 +552,7 @@ async def fetch_records(topic: str, start_type: int, start: int, max_bytes: int, print(max_bytes) for i, segment in enumerate(segments[segments.index(start_seg):]): records_sent = 0 - with open(f'data/topics/{topic.topic_name}/' + f'{segment:016x}' + '.log', 'rb') as logfile: + with 
open(f'{DATA_DIR}/topics/{topic.topic_name}/' + f'{segment:016x}' + '.log', 'rb') as logfile: if i == 0: logfile.seek(record_position) else: @@ -597,7 +599,7 @@ class Topic: @property def segments(self): - return [Segment(self.topic_name, bo) for bo in sorted([int(s.split('.')[0], 16) for s in os.listdir(f'data/topics/{self.topic_name}') if s.endswith('.log') and utils.is_number(s.split('.')[0], 16) and len(s.split('.')[0]) == 16])] + return [Segment(self.topic_name, bo) for bo in sorted([int(s.split('.')[0], 16) for s in os.listdir(f'{DATA_DIR}/topics/{self.topic_name}') if s.endswith('.log') and utils.is_number(s.split('.')[0], 16) and len(s.split('.')[0]) == 16])] class Segment: def __init__(self, topic: str, base_offset: int): @@ -609,7 +611,7 @@ class Segment: return f'Segment({self.topic}, {self.base_offset})' def get_base_timestamp(self): - with open(f'data/topics/{self.topic}/{self.base_offset:016x}.timeindex', 'rb') as logfile: + with open(f'{DATA_DIR}/topics/{self.topic}/{self.base_offset:016x}.timeindex', 'rb') as logfile: timestamp_bytes = logfile.read(8) if len(timestamp_bytes) < 8: timestamp = None