db_reader memory usage reduced by using an iterator instead of split

2025-06-17 22:56:43 +02:00 · 2025-06-17 22:56:43 +02:00 · 9e7c9a47b3
parent 412827226f
commit 9e7c9a47b3
1 changed files with 33 additions and 21 deletions
--- a/src/gaspype/_phys_data.py
+++ b/src/gaspype/_phys_data.py
@ -1,32 +1,44 @@
 import struct
 from typing import Generator, Iterator
 from dataclasses import dataclass
-class SpeciesData():
def split_on_space(data: bytes, offset: int, end: int) -> Iterator[str]:
    """Lazily split a region of a byte buffer into ASCII strings on spaces.

    Only the bytes in ``data[offset:end]`` are examined. Tokens are produced
    one at a time, so no list holding all tokens is ever built. Leading,
    trailing and repeated spaces never produce empty strings.

    Args:
        data: The byte buffer to split.
        offset: Index of the first byte to examine (inclusive).
        end: Index at which to stop (exclusive). Values past the end of
            ``data`` are clamped to ``len(data)``, as slicing would do.

    Yields:
        str: Each non-empty, space-delimited token decoded as ASCII.

    Raises:
        UnicodeDecodeError: If a token contains non-ASCII bytes.
    """
    # Clamp so an oversized range behaves like a slice instead of producing
    # an empty trailing token.
    end = min(end, len(data))
    start = offset
    while start < end:
        # bytes.find scans for the delimiter in C instead of testing every
        # byte in a Python-level loop.
        space = data.find(b' ', start, end)
        if space < 0:
            # No further delimiter: the rest of the range is the last token.
            yield data[start:end].decode('ascii')
            return
        if start < space:  # skip empty tokens caused by adjacent spaces
            yield data[start:space].decode('ascii')
        start = space + 1
@dataclass
 class SpeciesData:
    """Class to hold the physical data for a species.
    Attributes:
-        comp: Dictionary of species composition with element symbols as keys and their counts as values.
+        name: Name of the species.
        composition: Dictionary of species composition with element symbols as keys and their counts as values.
        model: Number of polynomial coefficients used in the model.
        ref_string: Reference string for the data source.
        t_range: List of temperature nodes marking intervals.
        data: List of lists containing physical data for each temperature interval.
    """
-
+    name: str
-    def __init__(self, name: str, comp: dict[str, int], model: int, ref: str, t_range: list[float], data: list[list[float]]):
+    composition: dict[str, int]
-        self.name = name
+    model: int
-        self.composition: dict[str, int] = comp
+    ref_string: str
-        self.model: int = model
+    t_range: list[float]
-        self.ref_string: str = ref
+    data: list[list[float]]
        self.t_range: list[float] = t_range
        self.data: list[list[float]] = data
    def __repr__(self) -> str:
        return (f"Name: {self.name}\n" +
                f"Composition: {self.composition}\n" +
                f"Model: {self.model}\n" +
                f"Reference: {self.ref_string}\n" +
                f"Temperatures: {self.t_range}\n" +
                f"Data: {self.data}".replace('),', '),\n'))
 class db_reader():
@ -45,8 +57,8 @@ class db_reader():
        """
        assert inp_data[:4] == b'gapy', 'Unknown data format'
        self._bin_data = inp_data
-        self._name_count = struct.unpack('<I', self._bin_data[4:8])[0]
+        self._name_lengths = struct.unpack('<I', self._bin_data[4:8])[0]
-        species_names = self._bin_data[db_reader.header_len:(db_reader.header_len + self._name_count)].decode('ASCII').split(' ')
+        species_names = split_on_space(self._bin_data, db_reader.header_len, db_reader.header_len + self._name_lengths)
        self._index = {s: i for i, s in enumerate(species_names)}
    @property
@ -82,7 +94,7 @@ class db_reader():
        if name not in self._index:
            return None
-        head_offset = self._name_count + db_reader.header_len + self._index[name] * db_reader.header_len
+        head_offset = self._name_lengths + db_reader.header_len + self._index[name] * db_reader.header_len
        head = struct.unpack('<I4B', self._bin_data[head_offset:head_offset + db_reader.header_len])