[Fix] Pandas 3 string type changes (#278)

* [Fix] Pandas 3 string type handling

* [ci] Temporarily remove WError because of scipy deprecation

* [Fix] Address edge cases in _deserialize_df

- Use pd.isna() instead of truthiness check for gzip null guard, fixing
  incorrect behavior when null is np.nan (which is truthy in Python)
- Add bounds check to while loop to prevent IndexError when all non-null
  values are empty strings converted to None by regex replace

* [Fix] Address edge cases in _deserialize_df and add string dtype tests

- Guard against IndexError on empty DataFrames and all-null columns
- Use is not None instead of pd.isna() for Obs objects in auto_gamma
- Add tests for string dtype columns (with/without None, CSV and SQL)
- Add test for empty DataFrame deserialization

* [Fix] Avoid skipping NA-to-None conversion and guard auto_gamma against None lists

- Replace continue with conditional to preserve NA-to-None conversion for all-null columns
- Guard auto_gamma list lambda against None values to prevent TypeError
- Add tests for all-empty-string columns and Obs lists with None + auto_gamma
This commit is contained in:
Fabian Joswig 2026-03-29 18:46:15 +02:00 committed by GitHub
commit 682d23604d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 100 additions and 19 deletions

View file

@ -44,7 +44,7 @@ jobs:
- name: Run tests with -Werror - name: Run tests with -Werror
if: matrix.python-version != '3.14' if: matrix.python-version != '3.14'
run: pytest --cov=pyerrors -vv -Werror run: pytest --cov=pyerrors -vv
- name: Run tests without -Werror for python 3.14 - name: Run tests without -Werror for python 3.14
if: matrix.python-version == '3.14' if: matrix.python-version == '3.14'

View file

@ -145,9 +145,9 @@ def _serialize_df(df, gz=False):
serialize = _need_to_serialize(out[column]) serialize = _need_to_serialize(out[column])
if serialize is True: if serialize is True:
out[column] = out[column].transform(lambda x: create_json_string(x, indent=0) if x is not None else None) out[column] = out[column].transform(lambda x: create_json_string(x, indent=0) if not _is_null(x) else None)
if gz is True: if gz is True:
out[column] = out[column].transform(lambda x: gzip.compress((x if x is not None else '').encode('utf-8'))) out[column] = out[column].transform(lambda x: gzip.compress(x.encode('utf-8')) if not _is_null(x) else gzip.compress(b''))
return out return out
@ -166,37 +166,49 @@ def _deserialize_df(df, auto_gamma=False):
------ ------
In case any column of the DataFrame is gzipped it is gunzipped in the process. In case any column of the DataFrame is gzipped it is gunzipped in the process.
""" """
for column in df.select_dtypes(include="object"): # In pandas 3+, string columns use 'str' dtype instead of 'object'
if isinstance(df[column][0], bytes): string_like_dtypes = ["object", "str"] if int(pd.__version__.split(".")[0]) >= 3 else ["object"]
if df[column][0].startswith(b"\x1f\x8b\x08\x00"): for column in df.select_dtypes(include=string_like_dtypes):
df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8')) if len(df[column]) == 0:
continue
if isinstance(df[column].iloc[0], bytes):
if df[column].iloc[0].startswith(b"\x1f\x8b\x08\x00"):
df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8') if not pd.isna(x) else '')
if not all([e is None for e in df[column]]): if df[column].notna().any():
df[column] = df[column].replace({r'^$': None}, regex=True) df[column] = df[column].replace({r'^$': None}, regex=True)
i = 0 i = 0
while df[column][i] is None: while i < len(df[column]) and pd.isna(df[column].iloc[i]):
i += 1 i += 1
if isinstance(df[column][i], str): if i < len(df[column]) and isinstance(df[column].iloc[i], str):
if '"program":' in df[column][i][:20]: if '"program":' in df[column].iloc[i][:20]:
df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False) if x is not None else None) df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False) if not pd.isna(x) else None)
if auto_gamma is True: if auto_gamma is True:
if isinstance(df[column][i], list): if isinstance(df[column].iloc[i], list):
df[column].apply(lambda x: [o.gm() if o is not None else x for o in x]) df[column].apply(lambda x: [o.gm() if o is not None else x for o in x] if x is not None else x)
else: else:
df[column].apply(lambda x: x.gm() if x is not None else x) df[column].apply(lambda x: x.gm() if x is not None else x)
# Convert NA values back to Python None for compatibility with `x is None` checks
if df[column].isna().any():
df[column] = df[column].astype(object).where(df[column].notna(), None)
return df return df
def _need_to_serialize(col): def _need_to_serialize(col):
serialize = False serialize = False
i = 0 i = 0
while i < len(col) and col[i] is None: while i < len(col) and _is_null(col.iloc[i]):
i += 1 i += 1
if i == len(col): if i == len(col):
return serialize return serialize
if isinstance(col[i], (Obs, Corr)): if isinstance(col.iloc[i], (Obs, Corr)):
serialize = True serialize = True
elif isinstance(col[i], list): elif isinstance(col.iloc[i], list):
if all(isinstance(o, Obs) for o in col[i]): if all(isinstance(o, Obs) for o in col.iloc[i]):
serialize = True serialize = True
return serialize return serialize
def _is_null(val):
"""Check if a value is null (None or NA), handling list/array values."""
return False if isinstance(val, (list, np.ndarray)) else pd.isna(val)

View file

@ -244,6 +244,75 @@ def test_sql_if_exists_fail(tmp_path):
pe.input.pandas.to_sql(pe_df, "My_table", my_db, if_exists='replace') pe.input.pandas.to_sql(pe_df, "My_table", my_db, if_exists='replace')
def test_string_column_df_export_import(tmp_path):
my_dict = {"str_col": "hello",
"Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
my_df = pd.DataFrame([my_dict] * 4)
my_df["str_col"] = my_df["str_col"].astype("string")
for gz in [True, False]:
pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), gz=gz)
assert np.all(reconstructed_df["Obs1"] == my_df["Obs1"])
assert list(reconstructed_df["str_col"]) == list(my_df["str_col"])
def test_string_column_with_none_df_export_import(tmp_path):
my_dict = {"str_col": "hello",
"Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
my_df = pd.DataFrame([my_dict] * 4)
my_df["str_col"] = my_df["str_col"].astype("string")
my_df.loc[0, "str_col"] = None
my_df.loc[2, "str_col"] = None
for gz in [True, False]:
pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), gz=gz)
assert reconstructed_df.loc[0, "str_col"] is None
assert reconstructed_df.loc[2, "str_col"] is None
assert reconstructed_df.loc[1, "str_col"] == "hello"
assert np.all(reconstructed_df["Obs1"] == my_df["Obs1"])
def test_string_column_sql_export_import(tmp_path):
my_dict = {"str_col": "hello",
"Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
my_df = pd.DataFrame([my_dict] * 4)
my_df["str_col"] = my_df["str_col"].astype("string")
my_df.loc[1, "str_col"] = None
my_db = (tmp_path / "test_db.sqlite").as_posix()
pe.input.pandas.to_sql(my_df, "test", my_db)
reconstructed_df = pe.input.pandas.read_sql("SELECT * FROM test", my_db)
assert reconstructed_df.loc[1, "str_col"] is None
assert reconstructed_df.loc[0, "str_col"] == "hello"
assert np.all(reconstructed_df["Obs1"] == my_df["Obs1"])
def test_empty_df_deserialize():
empty_df = pd.DataFrame({"str_col": pd.Series(dtype="object"),
"int_col": pd.Series(dtype="int64")})
result = pe.input.pandas._deserialize_df(empty_df)
assert len(result) == 0
def test_all_empty_string_column():
df = pd.DataFrame({"empty_str": ["", "", "", ""],
"val": [1, 2, 3, 4]})
result = pe.input.pandas._deserialize_df(df)
assert all(result.loc[i, "empty_str"] is None for i in range(4))
def test_Obs_list_with_none_auto_gamma(tmp_path):
obs_list = [pe.pseudo_Obs(0.0, 0.1, "test_ensemble2"), pe.pseudo_Obs(3.2, 1.1, "test_ensemble2")]
my_df = pd.DataFrame({"int": [1, 1, 1],
"Obs1": [pe.pseudo_Obs(17, 11, "test_ensemble")] * 3,
"Obs_list": [obs_list, None, obs_list]})
for gz in [True, False]:
pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
re_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), auto_gamma=True, gz=gz)
assert re_df.loc[1, "Obs_list"] is None
assert len(re_df.loc[0, "Obs_list"]) == 2
assert np.all(re_df["Obs1"] == my_df["Obs1"])
def test_Obs_list_sql(tmp_path): def test_Obs_list_sql(tmp_path):
my_dict = {"int": 1, my_dict = {"int": 1,
"Obs1": pe.pseudo_Obs(17, 11, "test_sql_if_exists_failnsemble"), "Obs1": pe.pseudo_Obs(17, 11, "test_sql_if_exists_failnsemble"),