From cd3739ee64cf4ba5e4453628d902a2be31b6aff9 Mon Sep 17 00:00:00 2001
From: Fabian Joswig
Date: Sun, 29 Mar 2026 18:12:21 +0200
Subject: [PATCH] [Fix] Address edge cases in _deserialize_df and add string dtype tests

- Guard against IndexError on empty DataFrames and all-null columns
- Use is not None instead of pd.isna() for Obs objects in auto_gamma
- Add tests for string dtype columns (with/without None, CSV and SQL)
- Add test for empty DataFrame deserialization
---
 pyerrors/input/pandas.py |  8 +++++--
 tests/pandas_test.py     | 49 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/pyerrors/input/pandas.py b/pyerrors/input/pandas.py
index 54bf14f4..518e7203 100644
--- a/pyerrors/input/pandas.py
+++ b/pyerrors/input/pandas.py
@@ -169,6 +169,8 @@ def _deserialize_df(df, auto_gamma=False):
     # In pandas 3+, string columns use 'str' dtype instead of 'object'
     string_like_dtypes = ["object", "str"] if int(pd.__version__.split(".")[0]) >= 3 else ["object"]
     for column in df.select_dtypes(include=string_like_dtypes):
+        if len(df[column]) == 0:
+            continue
         if isinstance(df[column].iloc[0], bytes):
             if df[column].iloc[0].startswith(b"\x1f\x8b\x08\x00"):
                 df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8') if not pd.isna(x) else '')
@@ -178,14 +180,16 @@ def _deserialize_df(df, auto_gamma=False):
             i = 0
             while i < len(df[column]) and pd.isna(df[column].iloc[i]):
                 i += 1
+            if i == len(df[column]):
+                continue
             if isinstance(df[column].iloc[i], str):
                 if '"program":' in df[column].iloc[i][:20]:
                     df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False) if not pd.isna(x) else None)
                     if auto_gamma is True:
                         if isinstance(df[column].iloc[i], list):
-                            df[column].apply(lambda x: [o.gm() if not pd.isna(o) else x for o in x])
+                            df[column].apply(lambda x: [o.gm() if o is not None else x for o in x])
                         else:
-                            df[column].apply(lambda x: x.gm() if not pd.isna(x) else x)
+                            df[column].apply(lambda x: x.gm() if x is not None else x)
         # Convert NA values back to Python None for compatibility with `x is None` checks
         if df[column].isna().any():
             df[column] = df[column].astype(object).where(df[column].notna(), None)
diff --git a/tests/pandas_test.py b/tests/pandas_test.py
index f86458f8..e1f0b5f1 100644
--- a/tests/pandas_test.py
+++ b/tests/pandas_test.py
@@ -244,6 +244,55 @@ def test_sql_if_exists_fail(tmp_path):
     pe.input.pandas.to_sql(pe_df, "My_table", my_db, if_exists='replace')
 
 
+def test_string_column_df_export_import(tmp_path):
+    my_dict = {"str_col": "hello",
+               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
+    my_df = pd.DataFrame([my_dict] * 4)
+    my_df["str_col"] = my_df["str_col"].astype("string")
+    for gz in [True, False]:
+        pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
+        reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), gz=gz)
+        assert np.all(reconstructed_df["Obs1"] == my_df["Obs1"])
+        assert list(reconstructed_df["str_col"]) == list(my_df["str_col"])
+
+
+def test_string_column_with_none_df_export_import(tmp_path):
+    my_dict = {"str_col": "hello",
+               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
+    my_df = pd.DataFrame([my_dict] * 4)
+    my_df["str_col"] = my_df["str_col"].astype("string")
+    my_df.loc[0, "str_col"] = None
+    my_df.loc[2, "str_col"] = None
+    for gz in [True, False]:
+        pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
+        reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), gz=gz)
+        assert reconstructed_df.loc[0, "str_col"] is None
+        assert reconstructed_df.loc[2, "str_col"] is None
+        assert reconstructed_df.loc[1, "str_col"] == "hello"
+        assert np.all(reconstructed_df["Obs1"] == my_df["Obs1"])
+
+
+def test_string_column_sql_export_import(tmp_path):
+    my_dict = {"str_col": "hello",
+               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
+    my_df = pd.DataFrame([my_dict] * 4)
+    my_df["str_col"] = my_df["str_col"].astype("string")
+    my_df.loc[1, "str_col"] = None
+    my_db = (tmp_path / "test_db.sqlite").as_posix()
+    pe.input.pandas.to_sql(my_df, "test", my_db)
+    reconstructed_df = pe.input.pandas.read_sql("SELECT * FROM test", my_db)
+    assert reconstructed_df.loc[1, "str_col"] is None
+    assert reconstructed_df.loc[0, "str_col"] == "hello"
+    assert np.all(reconstructed_df["Obs1"] == my_df["Obs1"])
+
+
+def test_empty_df_deserialize():
+    empty_df = pd.DataFrame({"str_col": pd.Series(dtype="object"),
+                             "int_col": pd.Series(dtype="int64")})
+    result = pe.input.pandas._deserialize_df(empty_df)
+    assert len(result) == 0
+
+
 def test_Obs_list_sql(tmp_path):
     my_dict = {"int": 1,
                "Obs1": pe.pseudo_Obs(17, 11, "test_sql_if_exists_failnsemble"),