[Fix] Pandas 3 string type changes (#278)

* [Fix] Pandas 3 string type handling
* [ci] Temporarily remove WError because of scipy deprecation
* [Fix] Address edge cases in _deserialize_df
  - Use pd.isna() instead of truthiness check for gzip null guard, fixing incorrect behavior when null is np.nan (which is truthy in Python)
  - Add bounds check to while loop to prevent IndexError when all non-null values are empty strings converted to None by regex replace
* [Fix] Address edge cases in _deserialize_df and add string dtype tests
  - Guard against IndexError on empty DataFrames and all-null columns
  - Use is not None instead of pd.isna() for Obs objects in auto_gamma
  - Add tests for string dtype columns (with/without None, CSV and SQL)
  - Add test for empty DataFrame deserialization
* [Fix] Avoid skipping NA-to-None conversion and guard auto_gamma against None lists
  - Replace continue with conditional to preserve NA-to-None conversion for all-null columns
  - Guard auto_gamma list lambda against None values to prevent TypeError
  - Add tests for all-empty-string columns and Obs lists with None + auto_gamma
2026-05-14 17:16:54 +02:00 · 2026-03-29 18:46:15 +02:00 · 2026-03-29 18:46:15 +02:00 · 682d23604d
commit 682d23604d
parent da399b7c02
3 changed files with 100 additions and 19 deletions
--- a/tests/pandas_test.py
+++ b/tests/pandas_test.py
@@ -244,6 +244,75 @@ def test_sql_if_exists_fail(tmp_path):
    pe.input.pandas.to_sql(pe_df, "My_table", my_db, if_exists='replace')


+def test_string_column_df_export_import(tmp_path):
+    my_dict = {"str_col": "hello",
+               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
+    my_df = pd.DataFrame([my_dict] * 4)
+    my_df["str_col"] = my_df["str_col"].astype("string")
+    for gz in [True, False]:
+        pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
+        reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), gz=gz)
+        assert np.all(reconstructed_df["Obs1"] == my_df["Obs1"])
+        assert list(reconstructed_df["str_col"]) == list(my_df["str_col"])
+
+
+def test_string_column_with_none_df_export_import(tmp_path):
+    my_dict = {"str_col": "hello",
+               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
+    my_df = pd.DataFrame([my_dict] * 4)
+    my_df["str_col"] = my_df["str_col"].astype("string")
+    my_df.loc[0, "str_col"] = None
+    my_df.loc[2, "str_col"] = None
+    for gz in [True, False]:
+        pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
+        reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), gz=gz)
+        assert reconstructed_df.loc[0, "str_col"] is None
+        assert reconstructed_df.loc[2, "str_col"] is None
+        assert reconstructed_df.loc[1, "str_col"] == "hello"
+        assert np.all(reconstructed_df["Obs1"] == my_df["Obs1"])
+
+
+def test_string_column_sql_export_import(tmp_path):
+    my_dict = {"str_col": "hello",
+               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
+    my_df = pd.DataFrame([my_dict] * 4)
+    my_df["str_col"] = my_df["str_col"].astype("string")
+    my_df.loc[1, "str_col"] = None
+    my_db = (tmp_path / "test_db.sqlite").as_posix()
+    pe.input.pandas.to_sql(my_df, "test", my_db)
+    reconstructed_df = pe.input.pandas.read_sql("SELECT * FROM test", my_db)
+    assert reconstructed_df.loc[1, "str_col"] is None
+    assert reconstructed_df.loc[0, "str_col"] == "hello"
+    assert np.all(reconstructed_df["Obs1"] == my_df["Obs1"])
+
+
+def test_empty_df_deserialize():
+    empty_df = pd.DataFrame({"str_col": pd.Series(dtype="object"),
+                              "int_col": pd.Series(dtype="int64")})
+    result = pe.input.pandas._deserialize_df(empty_df)
+    assert len(result) == 0
+
+
+def test_all_empty_string_column():
+    df = pd.DataFrame({"empty_str": ["", "", "", ""],
+                        "val": [1, 2, 3, 4]})
+    result = pe.input.pandas._deserialize_df(df)
+    assert all(result.loc[i, "empty_str"] is None for i in range(4))
+
+
+def test_Obs_list_with_none_auto_gamma(tmp_path):
+    obs_list = [pe.pseudo_Obs(0.0, 0.1, "test_ensemble2"), pe.pseudo_Obs(3.2, 1.1, "test_ensemble2")]
+    my_df = pd.DataFrame({"int": [1, 1, 1],
+                           "Obs1": [pe.pseudo_Obs(17, 11, "test_ensemble")] * 3,
+                           "Obs_list": [obs_list, None, obs_list]})
+    for gz in [True, False]:
+        pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
+        re_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), auto_gamma=True, gz=gz)
+        assert re_df.loc[1, "Obs_list"] is None
+        assert len(re_df.loc[0, "Obs_list"]) == 2
+        assert np.all(re_df["Obs1"] == my_df["Obs1"])
+
+
 def test_Obs_list_sql(tmp_path):
    my_dict = {"int": 1,
               "Obs1": pe.pseudo_Obs(17, 11, "test_sql_if_exists_failnsemble"),