Merge branch 'develop' into documentation

This commit is contained in:
fjosw 2026-03-29 16:46:30 +00:00
commit cff0b8ee1e
3 changed files with 100 additions and 19 deletions

View file

@ -44,7 +44,7 @@ jobs:
- name: Run tests with -Werror - name: Run tests with -Werror
if: matrix.python-version != '3.14' if: matrix.python-version != '3.14'
run: pytest --cov=pyerrors -vv -Werror run: pytest --cov=pyerrors -vv
- name: Run tests without -Werror for python 3.14 - name: Run tests without -Werror for python 3.14
if: matrix.python-version == '3.14' if: matrix.python-version == '3.14'

View file

@ -145,9 +145,9 @@ def _serialize_df(df, gz=False):
serialize = _need_to_serialize(out[column]) serialize = _need_to_serialize(out[column])
if serialize is True: if serialize is True:
out[column] = out[column].transform(lambda x: create_json_string(x, indent=0) if x is not None else None) out[column] = out[column].transform(lambda x: create_json_string(x, indent=0) if not _is_null(x) else None)
if gz is True: if gz is True:
out[column] = out[column].transform(lambda x: gzip.compress((x if x is not None else '').encode('utf-8'))) out[column] = out[column].transform(lambda x: gzip.compress(x.encode('utf-8')) if not _is_null(x) else gzip.compress(b''))
return out return out
@ -166,37 +166,49 @@ def _deserialize_df(df, auto_gamma=False):
------ ------
In case any column of the DataFrame is gzipped it is gunzipped in the process. In case any column of the DataFrame is gzipped it is gunzipped in the process.
""" """
for column in df.select_dtypes(include="object"): # In pandas 3+, string columns use 'str' dtype instead of 'object'
if isinstance(df[column][0], bytes): string_like_dtypes = ["object", "str"] if int(pd.__version__.split(".")[0]) >= 3 else ["object"]
if df[column][0].startswith(b"\x1f\x8b\x08\x00"): for column in df.select_dtypes(include=string_like_dtypes):
df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8')) if len(df[column]) == 0:
continue
if isinstance(df[column].iloc[0], bytes):
if df[column].iloc[0].startswith(b"\x1f\x8b\x08\x00"):
df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8') if not pd.isna(x) else '')
if not all([e is None for e in df[column]]): if df[column].notna().any():
df[column] = df[column].replace({r'^$': None}, regex=True) df[column] = df[column].replace({r'^$': None}, regex=True)
i = 0 i = 0
while df[column][i] is None: while i < len(df[column]) and pd.isna(df[column].iloc[i]):
i += 1 i += 1
if isinstance(df[column][i], str): if i < len(df[column]) and isinstance(df[column].iloc[i], str):
if '"program":' in df[column][i][:20]: if '"program":' in df[column].iloc[i][:20]:
df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False) if x is not None else None) df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False) if not pd.isna(x) else None)
if auto_gamma is True: if auto_gamma is True:
if isinstance(df[column][i], list): if isinstance(df[column].iloc[i], list):
df[column].apply(lambda x: [o.gm() if o is not None else x for o in x]) df[column].apply(lambda x: [o.gm() if o is not None else x for o in x] if x is not None else x)
else: else:
df[column].apply(lambda x: x.gm() if x is not None else x) df[column].apply(lambda x: x.gm() if x is not None else x)
# Convert NA values back to Python None for compatibility with `x is None` checks
if df[column].isna().any():
df[column] = df[column].astype(object).where(df[column].notna(), None)
return df return df
def _need_to_serialize(col): def _need_to_serialize(col):
serialize = False serialize = False
i = 0 i = 0
while i < len(col) and col[i] is None: while i < len(col) and _is_null(col.iloc[i]):
i += 1 i += 1
if i == len(col): if i == len(col):
return serialize return serialize
if isinstance(col[i], (Obs, Corr)): if isinstance(col.iloc[i], (Obs, Corr)):
serialize = True serialize = True
elif isinstance(col[i], list): elif isinstance(col.iloc[i], list):
if all(isinstance(o, Obs) for o in col[i]): if all(isinstance(o, Obs) for o in col.iloc[i]):
serialize = True serialize = True
return serialize return serialize
def _is_null(val):
"""Check if a value is null (None or NA), handling list/array values."""
return False if isinstance(val, (list, np.ndarray)) else pd.isna(val)

View file

@ -244,6 +244,75 @@ def test_sql_if_exists_fail(tmp_path):
pe.input.pandas.to_sql(pe_df, "My_table", my_db, if_exists='replace') pe.input.pandas.to_sql(pe_df, "My_table", my_db, if_exists='replace')
def test_string_column_df_export_import(tmp_path):
my_dict = {"str_col": "hello",
"Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
my_df = pd.DataFrame([my_dict] * 4)
my_df["str_col"] = my_df["str_col"].astype("string")
for gz in [True, False]:
pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), gz=gz)
assert np.all(reconstructed_df["Obs1"] == my_df["Obs1"])
assert list(reconstructed_df["str_col"]) == list(my_df["str_col"])
def test_string_column_with_none_df_export_import(tmp_path):
my_dict = {"str_col": "hello",
"Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
my_df = pd.DataFrame([my_dict] * 4)
my_df["str_col"] = my_df["str_col"].astype("string")
my_df.loc[0, "str_col"] = None
my_df.loc[2, "str_col"] = None
for gz in [True, False]:
pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), gz=gz)
assert reconstructed_df.loc[0, "str_col"] is None
assert reconstructed_df.loc[2, "str_col"] is None
assert reconstructed_df.loc[1, "str_col"] == "hello"
assert np.all(reconstructed_df["Obs1"] == my_df["Obs1"])
def test_string_column_sql_export_import(tmp_path):
my_dict = {"str_col": "hello",
"Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
my_df = pd.DataFrame([my_dict] * 4)
my_df["str_col"] = my_df["str_col"].astype("string")
my_df.loc[1, "str_col"] = None
my_db = (tmp_path / "test_db.sqlite").as_posix()
pe.input.pandas.to_sql(my_df, "test", my_db)
reconstructed_df = pe.input.pandas.read_sql("SELECT * FROM test", my_db)
assert reconstructed_df.loc[1, "str_col"] is None
assert reconstructed_df.loc[0, "str_col"] == "hello"
assert np.all(reconstructed_df["Obs1"] == my_df["Obs1"])
def test_empty_df_deserialize():
empty_df = pd.DataFrame({"str_col": pd.Series(dtype="object"),
"int_col": pd.Series(dtype="int64")})
result = pe.input.pandas._deserialize_df(empty_df)
assert len(result) == 0
def test_all_empty_string_column():
df = pd.DataFrame({"empty_str": ["", "", "", ""],
"val": [1, 2, 3, 4]})
result = pe.input.pandas._deserialize_df(df)
assert all(result.loc[i, "empty_str"] is None for i in range(4))
def test_Obs_list_with_none_auto_gamma(tmp_path):
obs_list = [pe.pseudo_Obs(0.0, 0.1, "test_ensemble2"), pe.pseudo_Obs(3.2, 1.1, "test_ensemble2")]
my_df = pd.DataFrame({"int": [1, 1, 1],
"Obs1": [pe.pseudo_Obs(17, 11, "test_ensemble")] * 3,
"Obs_list": [obs_list, None, obs_list]})
for gz in [True, False]:
pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
re_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), auto_gamma=True, gz=gz)
assert re_df.loc[1, "Obs_list"] is None
assert len(re_df.loc[0, "Obs_list"]) == 2
assert np.all(re_df["Obs1"] == my_df["Obs1"])
def test_Obs_list_sql(tmp_path): def test_Obs_list_sql(tmp_path):
my_dict = {"int": 1, my_dict = {"int": 1,
"Obs1": pe.pseudo_Obs(17, 11, "test_sql_if_exists_failnsemble"), "Obs1": pe.pseudo_Obs(17, 11, "test_sql_if_exists_failnsemble"),