neue Datei: .gitignore

neue Datei:     README.md
	neue Datei:     pyproject.toml
	neue Datei:     src/jeb_utils/__init__.py
	neue Datei:     src/jeb_utils/auth_utils.py
	neue Datei:     src/jeb_utils/crypto_utils.py
	neue Datei:     src/jeb_utils/exceptions.py
	neue Datei:     src/jeb_utils/jeb_utils.py
	neue Datei:     src/jeb_utils/jebp_utils.py
	neue Datei:     src/jeb_utils/utils.py
This commit is contained in:
2026-02-19 17:03:49 +01:00
commit a0036b6767
10 changed files with 1174 additions and 0 deletions
+649
View File
@@ -0,0 +1,649 @@
import dbm
import pickle
import os
import shutil
import time
import binascii
import io
import warnings
import re
from types import FunctionType, MethodType
from . import utils
from . import exceptions
# Public API of the module.
__all__ = [
    # bug fix: a missing comma after this entry implicitly concatenated it
    # with 'COMPRESSION_TYPE_NONE', exporting neither name correctly
    'TOPIC_PARTITION_SEPARATOR',
    'COMPRESSION_TYPE_NONE',
    'COMPRESSION_TYPE_GZIP',
    'COMPRESSION_TYPE_SNAPPY',
    'COMPRESSION_TYPE_LZ4',
    'COMPRESSION_TYPE_ZSTD',
    'FETCH_START_TYPE_OFFSET',
    'FETCH_START_TYPE_TIMESTAMP',
    'FETCH_RECORDS_CHUNK_SIZE',
    'SEGMENT_MAX_SIZE',
    'format_topic_partitions',
    'mktopic',
    'rmtopic',
    'mktopicpart',
    'rmtopicpart',
    'get_topics',
    'get_segment_base_timestamp',
    'get_log_end_offsets',
    'init',
    'get_next_lower_index_entry',
    'validate_topic_partition',
    'validate_topic_name',
    'validate_topic_format',
    'check_topic_exists',
    'pack_record_headers',
    'unpack_record_headers',
    'create_record',
    'fetch_records',
    'Topic',
    'Segment',
    'FileCorruptWarning',
    'AttributesByte',
]
# Separator between topic name and partition number in a formatted
# topic string, e.g. 'test-topic:01'.
TOPIC_PARTITION_SEPARATOR = ':'
# Allowed topic-name characters: alphanumerics, '.', '_' and '-'.
_TOPIC_NAME_REGEX = re.compile(r"^[a-zA-Z0-9._-]+$")
# Compression type identifiers stored in a record's attributes byte.
COMPRESSION_TYPE_NONE = 0
COMPRESSION_TYPE_GZIP = 1
COMPRESSION_TYPE_SNAPPY = 2
COMPRESSION_TYPE_LZ4 = 3
COMPRESSION_TYPE_ZSTD = 4
# Start-type markers accepted by fetch_records(): interpret `start` as an
# offset or as a timestamp, respectively.
FETCH_START_TYPE_OFFSET = 0xc0
FETCH_START_TYPE_TIMESTAMP = 0xc1
FETCH_RECORDS_CHUNK_SIZE = 1024 ** 2 # 1 MiB
# Maximum segment file size in bytes; configured by init().
SEGMENT_MAX_SIZE = None
# Maps 'topic:partition' -> next offset to assign; populated by init()
# via get_log_end_offsets().
log_end_offsets = {}
def format_topic_partitions(topic_name: str, partitions: set) -> list:
    '''
    Builds the formatted partition names of a topic.

    :param topic_name: The name of the topic, e. g. ``test-topic``
    :type topic_name: str
    :param partitions: The partitions, e. g. ``{1, 2, 4}``
    :type partitions: set
    :return: The partition names, e. g. ``['test-topic:01', 'test-topic:02', 'test-topic:04']``
    :rtype: list
    '''
    formatted = []
    for number in partitions:
        # partition numbers are zero-padded to two digits
        formatted.append(f'{topic_name}{TOPIC_PARTITION_SEPARATOR}{number:02d}')
    return formatted
def mktopic(topic_name: str):
    '''
    Creates a topic with a single initial partition (01).

    :param topic_name: The name of the topic
    :type topic_name: str
    :raises TopicExistsError: If the topic already exists
    '''
    with dbm.open('data/conf/topics', 'c') as db:
        if topic_name.encode() in db.keys():
            raise exceptions.TopicExistsError(f'topic \'{topic_name}\' already exists')
        db[topic_name] = pickle.dumps({'partitions': {1}})
    # the with-statement closes the database; the explicit close() that was
    # here was redundant
    os.mkdir(f'data/topics/{topic_name}{TOPIC_PARTITION_SEPARATOR}01/')
def rmtopic(topic_name: str):
    '''
    Removes a topic.

    :param topic_name: The topic name
    :type topic_name: str
    :raises TopicNotFoundError: If the topic does not exist.
    '''
    with dbm.open('data/conf/topics', 'c') as db:
        if topic_name.encode() not in db.keys():
            raise exceptions.TopicNotFoundError(f'topic \'{topic_name}\' does not exist')
        partitions = pickle.loads(db[topic_name.encode()])['partitions']
        del db[topic_name]
    # best-effort cleanup: a partition directory that is already gone is
    # not an error
    for partition_dir in format_topic_partitions(topic_name, partitions):
        try:
            shutil.rmtree(f'data/topics/{partition_dir}/')
        except OSError:
            # bug fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt and SystemExit
            pass
def mktopicpart(topic_name: str, partition_number: int):
    '''
    Creates a partition of a topic.

    :param topic_name: The topic name
    :type topic_name: str
    :param partition_number: The partition number (1..99)
    :type partition_number: int
    :raises ValueError: If the partition number is out of range
    :raises TopicNotFoundError: If the topic does not exist
    :raises TopicPartitionExistsError: If the partition already exists
    '''
    if partition_number < 1 or partition_number > 99:
        raise ValueError('partition number must be between 1 and 99')
    formatted_partition_number = f'{(int(partition_number)):02d}'
    with dbm.open('data/conf/topics', 'c') as db:
        if topic_name.encode() not in db.keys():
            # bug fix: was `exceptions.exceptions.TopicNotFoundError`, which
            # raised AttributeError instead of the intended exception
            raise exceptions.TopicNotFoundError(f'topic \'{topic_name}\' does not exist')
        partitions = pickle.loads(db[topic_name.encode()])['partitions']
        if partition_number in partitions:
            raise exceptions.TopicPartitionExistsError(f'partition \'{formatted_partition_number}\' already exists for topic \'{topic_name}\'')
        db[topic_name] = pickle.dumps({'partitions': partitions | {int(partition_number)}})
    os.mkdir(f'data/topics/{topic_name}{TOPIC_PARTITION_SEPARATOR}{formatted_partition_number}/')
def rmtopicpart(topic_name: str, partition_number: int):
    '''
    Removes a partition of a topic.

    :param topic_name: The topic name
    :type topic_name: str
    :param partition_number: The partition number
    :type partition_number: int
    :raises TopicNotFoundError: If the topic does not exist
    :raises TopicPartitionNotFoundError: If the partition does not exist
    '''
    formatted_partition_number = f'{(int(partition_number)):02d}'
    with dbm.open('data/conf/topics', 'c') as db:
        if topic_name.encode() not in db.keys():
            raise exceptions.TopicNotFoundError(f'topic \'{topic_name}\' does not exist')
        partitions = pickle.loads(db[topic_name.encode()])['partitions']
        if partition_number not in partitions:
            raise exceptions.TopicPartitionNotFoundError(f'partition \'{formatted_partition_number}\' does not exist for topic \'{topic_name}\'')
        # the with-statement closes the database; the explicit close() was
        # redundant
        db[topic_name] = pickle.dumps({'partitions': partitions - {int(partition_number)}})
    shutil.rmtree(f'data/topics/{topic_name}{TOPIC_PARTITION_SEPARATOR}{formatted_partition_number}/')
def get_topics():
    '''
    Returns all topics.

    :return: The topics, mapping topic name to its configuration dict
        (currently ``{'partitions': set}``)
    :rtype: dict
    '''
    # the with-statement closes the database; the explicit close() was
    # redundant
    with dbm.open('data/conf/topics', 'c') as db:
        return {k.decode(): pickle.loads(v) for k, v in db.items()}
def _create_base_segment_files(topics: dict):
    # Ensures every partition of every topic has the base (all-zero-offset)
    # segment files: .log, .index and .timeindex.
    for topic_name, topic_conf in topics.items():
        for partition in format_topic_partitions(topic_name, topic_conf['partitions']):
            base_path = f'data/topics/{partition}/' + '0' * 16
            for extension in ('.log', '.index', '.timeindex'):
                utils.create_file_if_not_exists(base_path + extension)
def get_segment_base_timestamp(topic: str, segment: int) -> int:
    '''
    Returns the base timestamp of a segment.

    :param topic: The topic
    :type topic: str
    :param segment: The segment base offset
    :type segment: int
    :return: The base timestamp (milliseconds since 1970)
    :rtype: int
    '''
    # On-disk record layout (see create_record):
    # size (8) | offset (8) | crc (4) | topic length (2) | topic |
    # attributes (1) | timestamp (8) | ...
    # The topic-length field therefore starts at byte 8 + 8 + 4 = 20.
    with open(f'data/topics/{topic}/{segment:016x}.log', 'rb') as logfile:
        # go to the topic length
        logfile.seek(20)
        # skip topic
        topic_length = int.from_bytes(logfile.read(2))
        # +1 skips the attributes byte that follows the topic
        logfile.seek(topic_length + 1, os.SEEK_CUR)
        timestamp_bytes = logfile.read(8)
        # an empty or truncated segment has no first record yet
        if len(timestamp_bytes) < 8:
            timestamp = None
        else:
            timestamp = int.from_bytes(timestamp_bytes)
    return timestamp
def get_log_end_offsets(topics: dict):
    '''
    Scans every partition's segment files and populates the module-level
    ``log_end_offsets`` mapping (``'topic:partition'`` -> record count).

    :param topics: The topics as returned by :func:`get_topics`
    :type topics: dict
    '''
    partition_names = [f'{t}{TOPIC_PARTITION_SEPARATOR}{p:02d}' for t, v in topics.items() for p in v['partitions']]
    # removed `last_topics_segments`: it scanned and sorted every partition
    # directory but was never used
    topic_segments = {t: Topic(t).segments for t in partition_names}
    for topic, segments in topic_segments.items():
        log_end_offset = 0
        for segment in segments:
            with open(f'data/topics/{topic}/{segment.base_offset:016x}.log', 'rb') as logfile:
                logfile.seek(0, os.SEEK_END)
                file_size = logfile.tell()
                logfile.seek(0)
                # count records: each is an 8-byte size prefix followed by
                # that many bytes; stop at the first truncated record
                while True:
                    record_size_bytes = logfile.read(8)
                    if not record_size_bytes or len(record_size_bytes) < 8:
                        break
                    record_size = int.from_bytes(record_size_bytes)
                    if logfile.tell() + record_size > file_size:
                        break
                    logfile.seek(record_size, os.SEEK_CUR)
                    log_end_offset += 1
        log_end_offsets[topic] = log_end_offset
def init(config: dict | None = None):
    '''
    Initializes the module. Calling this function is required before using
    most of the functions of the module.

    :param config: The configuration; supported key ``segment_max_size``
        (maximum segment file size in bytes, default ``1024 * 256``)
    :type config: dict | None
    '''
    global SEGMENT_MAX_SIZE
    # bug fix: the default was a mutable dict literal in the signature;
    # use a None sentinel instead (same effective default)
    if config is None:
        config = {'segment_max_size': 1024 * 256}
    SEGMENT_MAX_SIZE = config.get('segment_max_size', SEGMENT_MAX_SIZE)
    topics = get_topics()
    _create_base_segment_files(topics)
    get_log_end_offsets(topics)
def get_next_lower_index_entry(offset: int) -> int:
    # Index files hold one entry per 1024 records (see create_record), so
    # round the offset down to the nearest multiple of 1024 to address the
    # closest index entry at or below it.
    return utils.get_next_lower_integer_multiple(offset, 1024)
def validate_topic_partition(partition_number: str | int) -> bool:
'''
Validates a topic partition number format.
:param partition_number: The partition number
:type partition_number: str | int
:return: ``True``, if the partition number is valid, otherwise ``False``
:rtype: bool
'''
if isinstance(partition_number, str):
if len(partition_number) != 2 or not partition_number.isdigit():
return False
partition_number = int(partition_number)
elif not isinstance(partition_number, int):
return False
if not (0 < partition_number < 100):
return False
return True
def validate_topic_name(topic_name: str) -> bool:
    '''
    Validates a topic name.

    :param topic_name: The topic name
    :type topic_name: str
    :return: ``True``, if the topic name is valid, otherwise ``False``
    :rtype: bool
    '''
    if not isinstance(topic_name, str):
        return False
    if len(topic_name) == 0 or len(topic_name) > 255:
        return False
    # only alphanumerics, '.', '_' and '-' are allowed
    if not re.fullmatch(r'[a-zA-Z0-9._-]+', topic_name):
        return False
    # names must not begin or end with a dot (this also covers the '..'
    # checks the original repeated redundantly)
    if topic_name.startswith('.') or topic_name.endswith('.'):
        return False
    # bug fix: the original fell off the end and returned None for valid
    # names instead of True
    return True
def validate_topic_format(topic: str):
    '''
    Checks that ``topic`` has the form ``<name><separator><two-digit number>``,
    e. g. ``test-topic:01``.

    :param topic: The formatted topic
    :type topic: str
    :raises ValueError: If the format is invalid
    '''
    parts = topic.split(TOPIC_PARTITION_SEPARATOR)
    # exactly one separator must be present
    if len(parts) != 2:
        raise ValueError('invalid topic format')
    partition_part = parts[1]
    if len(partition_part) != 2 or not partition_part.isdigit():
        raise ValueError('invalid partition number format')
def check_topic_exists(topic: str):
    '''
    Checks whether a topic exists.

    :param topic: The topic name
    :type topic: str
    :raises TopicNotFoundError: If the topic does not exist
    :raises TopicPartitionNotFoundError: If the partition does not exist
    '''
    known_topics = get_topics()
    topic_name, formatted_partition_number = topic.split(TOPIC_PARTITION_SEPARATOR)
    raw_partition_number = int(formatted_partition_number)
    if topic_name not in known_topics:
        raise exceptions.TopicNotFoundError(f'topic \'{topic_name}\' does not exist')
    if raw_partition_number not in known_topics[topic_name]['partitions']:
        raise exceptions.TopicPartitionNotFoundError(f'partition \'{formatted_partition_number}\' does not exist for topic \'{topic_name}\'')
def pack_record_headers(headers: dict) -> bytes:
'''
Packs the record header dictionary into bytes.
:param headers: The header dictionary
:type headers: dict
:return: The bytes
:rtype: bytes
'''
packed = b''
headers = {(k.encode() if isinstance(k, str) else k): (v.encode() if isinstance(v, str) else v) for k, v in headers.items()}
for k, v in headers.items():
packed += len(k).to_bytes(4)
packed += k
packed += len(v).to_bytes(4)
packed += v
return packed
def _unpack_record_headers(buffer: io.BytesIO, number: int = 256):
    # Reads up to ``number`` header entries from ``buffer``. Each entry is a
    # 4-byte key length, the key, a 4-byte value length and the value.
    # Parsing stops at the first truncated field.
    def read_field():
        # returns None on a truncated length prefix or payload
        length_bytes = buffer.read(4)
        if len(length_bytes) < 4:
            return None
        length = int.from_bytes(length_bytes)
        data = buffer.read(length)
        if len(data) < length:
            return None
        return data

    headers = {}
    for _ in range(number):
        key = read_field()
        if key is None:
            break
        value = read_field()
        if value is None:
            break
        headers[key] = value
    return headers
def unpack_record_headers(packed: bytes) -> dict:
    '''
    Unpacks the record headers from bytes.

    :param packed: The packed headers
    :type packed: bytes
    :return: The header dictionary
    :rtype: dict
    '''
    return _unpack_record_headers(io.BytesIO(packed))
def create_record(topic: str, timestamp: int, record_data: bytes, key: bytes = b'', compression_type: int = 0, headers: dict | None = None):
    '''
    Creates a record and appends it to the active segment of the topic
    partition, rolling over to a new segment when the active one would
    exceed ``SEGMENT_MAX_SIZE``.

    :param topic: The topic name (``<name>:<partition>``)
    :type topic: str
    :param timestamp: The timestamp (ms since 1970); ``0`` means "stamp with
        the current time"
    :type timestamp: int
    :param record_data: The payload
    :type record_data: bytes
    :param key: The key
    :type key: bytes
    :param compression_type: The compression type
    :type compression_type: int
    :param headers: The header dictionary
    :type headers: dict | None
    :raises ValueError: If the topic format is invalid
    :raises TopicNotFoundError: If the topic does not exist
    :raises TopicPartitionNotFoundError: If the partition does not exist
    '''
    global log_end_offsets
    # bug fix: the default was a shared mutable dict in the signature
    if headers is None:
        headers = {}
    validate_topic_format(topic)
    topic_name, formatted_partition_number = topic.split(TOPIC_PARTITION_SEPARATOR)
    check_topic_exists(topic)
    topic_dir = f'data/topics/{topic_name}{TOPIC_PARTITION_SEPARATOR}{formatted_partition_number}/'
    utils.create_file_if_not_exists(topic_dir + '0' * 16 + '.log')
    segments = sorted([int(f.split('.')[0], 16) for f in os.listdir(topic_dir) if f.endswith('.log') and utils.is_number(f.split('.')[0], 16) and len(f.split('.')[0]) == 16])
    last_segment = max(segments)
    if timestamp == 0:
        # no timestamp supplied: use the current time and mark it as such
        timestamp = int(time.time() * 1000)
        timestamp_type = 1
    else:
        timestamp_type = 0
    # Assemble record
    record = b''
    record += log_end_offsets[topic].to_bytes(8) # Offset
    record += binascii.crc32(record_data).to_bytes(4) # CRC
    record += (len(topic)).to_bytes(2) # Topic name length
    record += topic.encode() # Topic
    record += AttributesByte(compression_type, timestamp_type).to_byte() # Attributes
    record += timestamp.to_bytes(8) # Timestamp
    record += (len(key)).to_bytes(4) # Key Length
    record += key # Key
    record += (len(record_data)).to_bytes(4) # Content Length
    record += record_data # Content
    record += (len(headers)).to_bytes(1) # Header count
    # bug fix: the loop variable was named `key`, shadowing the record-key
    # parameter above
    for header_key, header_value in headers.items():
        record += len(header_key).to_bytes(4)
        record += header_key
        record += len(header_value).to_bytes(4)
        record += header_value
    record = len(record).to_bytes(8) + record # Prepend Record Size
    # roll over: a new segment is named after the offset of its first record
    if os.path.getsize(topic_dir + f'{last_segment:016x}.log') + len(record) > SEGMENT_MAX_SIZE:
        last_segment = log_end_offsets[topic]
    with open(topic_dir + f'{last_segment:016x}.log', 'ab') as logfile:
        logfile.write(record)
    utils.create_file_if_not_exists(topic_dir + f'{last_segment:016x}.index')
    utils.create_file_if_not_exists(topic_dir + f'{last_segment:016x}.timeindex')
    # one (time)index entry per 1024 records
    if log_end_offsets[topic] % 1024 == 0:
        index = log_end_offsets[topic].to_bytes(8) + os.path.getsize(topic_dir + f'{last_segment:016x}.log').to_bytes(4)
        timeindex = timestamp.to_bytes(8) + log_end_offsets[topic].to_bytes(8)
        with open(topic_dir + f'{last_segment:016x}.index', 'ab') as idxfile:
            idxfile.write(index)
        with open(topic_dir + f'{last_segment:016x}.timeindex', 'ab') as timeidxfile:
            timeidxfile.write(timeindex)
    log_end_offsets[topic] += 1
async def fetch_records(topic: str, start_type: int, start: int, max_bytes: int, send_block_function: FunctionType | MethodType):
    '''
    Fetches records from the topic and sends it using ``send_block_function``

    :param topic: The topic name
    :type topic: str
    :param start_type: The start type (``FETCH_START_TYPE_OFFSET`` or
        ``FETCH_START_TYPE_TIMESTAMP``)
    :type start_type: int
    :param start: The start offset or timestamp, depending on ``start_type``
    :type start: int
    :param max_bytes: The maximum of bytes
    :type max_bytes: int
    :param send_block_function: The function to send the chunks
    :type send_block_function: FunctionType | MethodType
    :raises ValueError: If the topic format or the start type is invalid
    :raises TopicNotFoundError: If the topic does not exist
    :raises TopicPartitionNotFoundError: If the partition does not exist
    '''
    topic = Topic(topic)
    # removed unused locals (`topics`, `topic_name`, `raw_partition_number`)
    validate_topic_format(topic.topic_name)
    if start_type not in (FETCH_START_TYPE_OFFSET, FETCH_START_TYPE_TIMESTAMP):
        raise ValueError('invalid start type')
    check_topic_exists(topic.topic_name)
    segments = [int(f.split('.')[0], 16) for f in os.listdir(f'data/topics/{topic.topic_name}/') if utils.is_number(f.split('.')[0], 16) and len(f.split('.')[0]) == 16 and f.endswith('.log')]
    if start_type == FETCH_START_TYPE_TIMESTAMP:
        # resolve the timestamp to an offset: pick the segment whose base
        # timestamp is nearest at or below `start`, then scan its time index
        segment_base_timestamps = {s: s.base_timestamp for s in topic.segments}
        s = {v: k for k, v in segment_base_timestamps.items()}[utils.find_nearest_lower_number(list(segment_base_timestamps.values()), start) or min(segment_base_timestamps.values())]
        with open(f'data/topics/{topic.topic_name}/{s.base_offset:016x}.timeindex', 'rb') as f:
            timestamps = {}
            f.seek(0)
            # each time-index entry is timestamp (8 bytes) + offset (8 bytes)
            while True:
                timestamp_bytes = f.read(8)
                if len(timestamp_bytes) < 8:
                    break
                timestamp = int.from_bytes(timestamp_bytes)
                offset_bytes = f.read(8)
                if len(offset_bytes) < 8:
                    break
                offset = int.from_bytes(offset_bytes)
                timestamps[timestamp] = offset
        start = timestamps[utils.find_nearest_lower_number(timestamps.keys(), start) or min(timestamps.keys())]
    if start < 0 or start >= log_end_offsets[topic.topic_name]:
        # start is out of range: send an error block and stop
        await send_block_function(b'\xb1\x31')
        return
    idx_offset = get_next_lower_index_entry(start)
    start_seg = utils.find_nearest_lower_number(segments, start)
    with open(f'data/topics/{topic.topic_name}/' + f'{start_seg:016x}' + '.index', 'rb') as idxfile:
        idxfile.seek(0, os.SEEK_END)
        file_size = idxfile.tell()
        # each index entry is 12 bytes: offset (8) + file position (4)
        idxfile.seek(idx_offset // 1024 * 12)
        index_entry = idxfile.read(12)
        idx_entry_pos = int.from_bytes(index_entry[8:])
        if int.from_bytes(index_entry[:8]) != idx_offset:
            warnings.warn(f'Index entry offset mismatch: expected {idx_offset}, got {int.from_bytes(index_entry[:8])}', FileCorruptWarning)
    with open(f'data/topics/{topic.topic_name}/{start_seg:016x}.log', 'rb') as logfile:
        logfile.seek(idx_entry_pos)
        offset = idx_offset
        record_position = 0
        # walk forward from the index entry to the exact start record
        while offset < start:
            record_length_bytes = logfile.read(8)
            if len(record_length_bytes) < 8:
                break
            record_length = int.from_bytes(record_length_bytes)
            offset_bytes = logfile.read(8)
            if len(offset_bytes) < 8:
                break
            offset = int.from_bytes(offset_bytes)
            record_position = logfile.tell() - 16
            logfile.seek(record_length - 8, os.SEEK_CUR)
    await send_block_function(b'\xa0\x30')
    segments = sorted(segments)
    bytes_fetched = 0
    # bug fix: removed a leftover debug `print(max_bytes)` and the unused
    # `records_sent` counter
    for i, segment in enumerate(segments[segments.index(start_seg):]):
        with open(f'data/topics/{topic.topic_name}/' + f'{segment:016x}' + '.log', 'rb') as logfile:
            if i == 0:
                logfile.seek(record_position)
            else:
                logfile.seek(0)
            while bytes_fetched <= max_bytes:
                record_size_bytes = logfile.read(8)
                if len(record_size_bytes) < 8:
                    break
                record_size = int.from_bytes(record_size_bytes)
                if bytes_fetched + 8 + record_size > max_bytes:
                    break
                # stream the record in chunks of FETCH_RECORDS_CHUNK_SIZE
                to_send = record_size_bytes
                for _ in range(record_size // FETCH_RECORDS_CHUNK_SIZE):
                    to_send += logfile.read(FETCH_RECORDS_CHUNK_SIZE)
                    await send_block_function(b'\x01' + to_send)
                    to_send = b''
                to_send += logfile.read(record_size % FETCH_RECORDS_CHUNK_SIZE)
                await send_block_function(b'\x01' + to_send)
                bytes_fetched += 8 + record_size
        if bytes_fetched >= max_bytes:
            break
    await send_block_function(b'\x00') # EOF
class Topic:
    '''Represents a topic partition directory under ``data/topics/``.'''

    def __init__(self, topic_name: str):
        # formatted name, e.g. 'test-topic:01'
        self.topic_name = topic_name

    def __repr__(self):
        return f'Topic({self.topic_name})'

    @property
    def segments(self):
        '''All segments of this topic partition, sorted by base offset.'''
        base_offsets = []
        for filename in os.listdir(f'data/topics/{self.topic_name}'):
            stem = filename.split('.')[0]
            # segment files are named <16 hex digits>.log
            if filename.endswith('.log') and utils.is_number(stem, 16) and len(stem) == 16:
                base_offsets.append(int(stem, 16))
        return [Segment(self.topic_name, base_offset) for base_offset in sorted(base_offsets)]
class Segment:
    '''Represents a single log segment of a topic partition.'''

    def __init__(self, topic: str, base_offset: int):
        # formatted topic name, e.g. 'test-topic:01'
        self.topic = topic
        # offset of the first record in this segment
        self.base_offset = base_offset
        # first timestamp from the segment's time index (None if empty)
        self.base_timestamp = self.get_base_timestamp()

    def __repr__(self):
        return f'Segment({self.topic}, {self.base_offset})'

    def get_base_timestamp(self):
        '''Reads the first timestamp from the segment's ``.timeindex`` file.'''
        path = f'data/topics/{self.topic}/{self.base_offset:016x}.timeindex'
        with open(path, 'rb') as timeindex_file:
            first_entry = timeindex_file.read(8)
        # an empty time index has no timestamp yet
        if len(first_entry) < 8:
            return None
        return int.from_bytes(first_entry)
class FileCorruptWarning(UserWarning):
    '''Warning emitted when an index or log file looks corrupt/inconsistent.'''
    pass
class AttributesByte:
def __init__(self, compression_type: int, timestamp_type: int):
self.compression_type = compression_type
self.timestamp_type = timestamp_type
def to_byte(self):
return utils.bits_to_byte(
0, 0, 0, 0, # Unused bits
self.timestamp_type,
(self.compression_type >> 2) & 1,
(self.compression_type >> 1) & 1,
self.compression_type & 1,
)
@classmethod
def from_byte(cls, b: bytes | int):
if isinstance(b, int):
b = bytes([b])
bits = utils.byte_to_bits(b)
compression_type = utils.bits_to_byte(*bits[-3:])[0]
if compression_type > 4:
raise ValueError('invalid compression type in attributes byte')
timestamp_type = bits[-4]
if timestamp_type > 1:
raise ValueError('invalid timestamp type in attributes byte')
return cls(compression_type, timestamp_type)