mirror of
https://github.com/fjosw/pyerrors.git
synced 2026-04-14 16:51:38 +02:00
[Fix] Address edge cases in _deserialize_df and add string dtype tests
- Guard against IndexError on empty DataFrames and all-null columns
- Use `is not None` instead of `pd.isna()` for Obs objects in auto_gamma
- Add tests for string dtype columns (with/without None, CSV and SQL)
- Add test for empty DataFrame deserialization
This commit is contained in:
parent
69a33c80c1
commit
cd3739ee64
2 changed files with 55 additions and 2 deletions
|
|
@ -169,6 +169,8 @@ def _deserialize_df(df, auto_gamma=False):
|
|||
# In pandas 3+, string columns use 'str' dtype instead of 'object'
|
||||
string_like_dtypes = ["object", "str"] if int(pd.__version__.split(".")[0]) >= 3 else ["object"]
|
||||
for column in df.select_dtypes(include=string_like_dtypes):
|
||||
if len(df[column]) == 0:
|
||||
continue
|
||||
if isinstance(df[column].iloc[0], bytes):
|
||||
if df[column].iloc[0].startswith(b"\x1f\x8b\x08\x00"):
|
||||
df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8') if not pd.isna(x) else '')
|
||||
|
|
@ -178,14 +180,16 @@ def _deserialize_df(df, auto_gamma=False):
|
|||
i = 0
|
||||
while i < len(df[column]) and pd.isna(df[column].iloc[i]):
|
||||
i += 1
|
||||
if i == len(df[column]):
|
||||
continue
|
||||
if isinstance(df[column].iloc[i], str):
|
||||
if '"program":' in df[column].iloc[i][:20]:
|
||||
df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False) if not pd.isna(x) else None)
|
||||
if auto_gamma is True:
|
||||
if isinstance(df[column].iloc[i], list):
|
||||
df[column].apply(lambda x: [o.gm() if not pd.isna(o) else x for o in x])
|
||||
df[column].apply(lambda x: [o.gm() if o is not None else x for o in x])
|
||||
else:
|
||||
df[column].apply(lambda x: x.gm() if not pd.isna(x) else x)
|
||||
df[column].apply(lambda x: x.gm() if x is not None else x)
|
||||
# Convert NA values back to Python None for compatibility with `x is None` checks
|
||||
if df[column].isna().any():
|
||||
df[column] = df[column].astype(object).where(df[column].notna(), None)
|
||||
|
|
|
|||
|
|
@ -244,6 +244,55 @@ def test_sql_if_exists_fail(tmp_path):
|
|||
pe.input.pandas.to_sql(pe_df, "My_table", my_db, if_exists='replace')
|
||||
|
||||
|
||||
def test_string_column_df_export_import(tmp_path):
    """Round-trip a pandas 'string'-dtype column through dump_df/load_df (gzipped and plain)."""
    row = {"str_col": "hello",
           "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
    original = pd.DataFrame([row] * 4)
    original["str_col"] = original["str_col"].astype("string")
    # Write and read back once per compression setting.
    target = (tmp_path / 'df_output').as_posix()
    for gz in (True, False):
        pe.input.pandas.dump_df(original, target, gz=gz)
        roundtrip = pe.input.pandas.load_df(target, gz=gz)
        assert np.all(roundtrip["Obs1"] == original["Obs1"])
        assert list(roundtrip["str_col"]) == list(original["str_col"])
|
||||
|
||||
|
||||
def test_string_column_with_none_df_export_import(tmp_path):
    """Missing values in a 'string'-dtype column must come back as Python None after a file round trip."""
    row = {"str_col": "hello",
           "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
    original = pd.DataFrame([row] * 4)
    original["str_col"] = original["str_col"].astype("string")
    # Blank out two of the four rows to exercise the NA handling.
    for idx in (0, 2):
        original.loc[idx, "str_col"] = None
    target = (tmp_path / 'df_output').as_posix()
    for gz in (True, False):
        pe.input.pandas.dump_df(original, target, gz=gz)
        roundtrip = pe.input.pandas.load_df(target, gz=gz)
        assert roundtrip.loc[0, "str_col"] is None
        assert roundtrip.loc[2, "str_col"] is None
        assert roundtrip.loc[1, "str_col"] == "hello"
        assert np.all(roundtrip["Obs1"] == original["Obs1"])
|
||||
|
||||
|
||||
def test_string_column_sql_export_import(tmp_path):
    """String columns containing None survive a round trip through the SQLite backend."""
    row = {"str_col": "hello",
           "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
    original = pd.DataFrame([row] * 4)
    original["str_col"] = original["str_col"].astype("string")
    original.loc[1, "str_col"] = None
    db_path = (tmp_path / "test_db.sqlite").as_posix()
    pe.input.pandas.to_sql(original, "test", db_path)
    roundtrip = pe.input.pandas.read_sql("SELECT * FROM test", db_path)
    # The NULL row must reappear as Python None; the others keep their value.
    assert roundtrip.loc[1, "str_col"] is None
    assert roundtrip.loc[0, "str_col"] == "hello"
    assert np.all(roundtrip["Obs1"] == original["Obs1"])
|
||||
|
||||
|
||||
def test_empty_df_deserialize():
    """_deserialize_df must accept a zero-row DataFrame without raising."""
    columns = {"str_col": pd.Series(dtype="object"),
               "int_col": pd.Series(dtype="int64")}
    deserialized = pe.input.pandas._deserialize_df(pd.DataFrame(columns))
    assert len(deserialized) == 0
|
||||
|
||||
|
||||
def test_Obs_list_sql(tmp_path):
|
||||
my_dict = {"int": 1,
|
||||
"Obs1": pe.pseudo_Obs(17, 11, "test_sql_if_exists_failnsemble"),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue