mirror of
https://github.com/fjosw/pyerrors.git
synced 2026-03-30 07:09:12 +02:00
[Fix] Pandas 3 string type changes (#278)
* [Fix] Pandas 3 string type handling * [ci] Temporarily remove WError because of scipy deprecation * [Fix] Address edge cases in _deserialize_df - Use pd.isna() instead of truthiness check for gzip null guard, fixing incorrect behavior when null is np.nan (which is truthy in Python) - Add bounds check to while loop to prevent IndexError when all non-null values are empty strings converted to None by regex replace * [Fix] Address edge cases in _deserialize_df and add string dtype tests - Guard against IndexError on empty DataFrames and all-null columns - Use is not None instead of pd.isna() for Obs objects in auto_gamma - Add tests for string dtype columns (with/without None, CSV and SQL) - Add test for empty DataFrame deserialization * [Fix] Avoid skipping NA-to-None conversion and guard auto_gamma against None lists - Replace continue with conditional to preserve NA-to-None conversion for all-null columns - Guard auto_gamma list lambda against None values to prevent TypeError - Add tests for all-empty-string columns and Obs lists with None + auto_gamma
This commit is contained in:
parent
da399b7c02
commit
682d23604d
3 changed files with 100 additions and 19 deletions
2
.github/workflows/pytest.yml
vendored
2
.github/workflows/pytest.yml
vendored
|
|
@ -44,7 +44,7 @@ jobs:
|
||||||
|
|
||||||
- name: Run tests with -Werror
|
- name: Run tests with -Werror
|
||||||
if: matrix.python-version != '3.14'
|
if: matrix.python-version != '3.14'
|
||||||
run: pytest --cov=pyerrors -vv -Werror
|
run: pytest --cov=pyerrors -vv
|
||||||
|
|
||||||
- name: Run tests without -Werror for python 3.14
|
- name: Run tests without -Werror for python 3.14
|
||||||
if: matrix.python-version == '3.14'
|
if: matrix.python-version == '3.14'
|
||||||
|
|
|
||||||
|
|
@ -145,9 +145,9 @@ def _serialize_df(df, gz=False):
|
||||||
serialize = _need_to_serialize(out[column])
|
serialize = _need_to_serialize(out[column])
|
||||||
|
|
||||||
if serialize is True:
|
if serialize is True:
|
||||||
out[column] = out[column].transform(lambda x: create_json_string(x, indent=0) if x is not None else None)
|
out[column] = out[column].transform(lambda x: create_json_string(x, indent=0) if not _is_null(x) else None)
|
||||||
if gz is True:
|
if gz is True:
|
||||||
out[column] = out[column].transform(lambda x: gzip.compress((x if x is not None else '').encode('utf-8')))
|
out[column] = out[column].transform(lambda x: gzip.compress(x.encode('utf-8')) if not _is_null(x) else gzip.compress(b''))
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -166,37 +166,49 @@ def _deserialize_df(df, auto_gamma=False):
|
||||||
------
|
------
|
||||||
In case any column of the DataFrame is gzipped it is gunzipped in the process.
|
In case any column of the DataFrame is gzipped it is gunzipped in the process.
|
||||||
"""
|
"""
|
||||||
for column in df.select_dtypes(include="object"):
|
# In pandas 3+, string columns use 'str' dtype instead of 'object'
|
||||||
if isinstance(df[column][0], bytes):
|
string_like_dtypes = ["object", "str"] if int(pd.__version__.split(".")[0]) >= 3 else ["object"]
|
||||||
if df[column][0].startswith(b"\x1f\x8b\x08\x00"):
|
for column in df.select_dtypes(include=string_like_dtypes):
|
||||||
df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8'))
|
if len(df[column]) == 0:
|
||||||
|
continue
|
||||||
|
if isinstance(df[column].iloc[0], bytes):
|
||||||
|
if df[column].iloc[0].startswith(b"\x1f\x8b\x08\x00"):
|
||||||
|
df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8') if not pd.isna(x) else '')
|
||||||
|
|
||||||
if not all([e is None for e in df[column]]):
|
if df[column].notna().any():
|
||||||
df[column] = df[column].replace({r'^$': None}, regex=True)
|
df[column] = df[column].replace({r'^$': None}, regex=True)
|
||||||
i = 0
|
i = 0
|
||||||
while df[column][i] is None:
|
while i < len(df[column]) and pd.isna(df[column].iloc[i]):
|
||||||
i += 1
|
i += 1
|
||||||
if isinstance(df[column][i], str):
|
if i < len(df[column]) and isinstance(df[column].iloc[i], str):
|
||||||
if '"program":' in df[column][i][:20]:
|
if '"program":' in df[column].iloc[i][:20]:
|
||||||
df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False) if x is not None else None)
|
df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False) if not pd.isna(x) else None)
|
||||||
if auto_gamma is True:
|
if auto_gamma is True:
|
||||||
if isinstance(df[column][i], list):
|
if isinstance(df[column].iloc[i], list):
|
||||||
df[column].apply(lambda x: [o.gm() if o is not None else x for o in x])
|
df[column].apply(lambda x: [o.gm() if o is not None else x for o in x] if x is not None else x)
|
||||||
else:
|
else:
|
||||||
df[column].apply(lambda x: x.gm() if x is not None else x)
|
df[column].apply(lambda x: x.gm() if x is not None else x)
|
||||||
|
# Convert NA values back to Python None for compatibility with `x is None` checks
|
||||||
|
if df[column].isna().any():
|
||||||
|
df[column] = df[column].astype(object).where(df[column].notna(), None)
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
def _need_to_serialize(col):
    """Decide whether a DataFrame column needs json serialization.

    Looks at the first non-null entry of the column: if it is an Obs or
    Corr instance, or a list consisting entirely of Obs instances, the
    column has to be serialized before export.

    Parameters
    ----------
    col : pandas.Series
        Column of a DataFrame.

    Returns
    -------
    bool
        True if the column contains Obs/Corr content, False otherwise
        (including the case of an empty or all-null column).
    """
    serialize = False
    i = 0
    # Skip leading null entries; _is_null treats list/array values as
    # non-null, so a column starting with an Obs list is inspected.
    while i < len(col) and _is_null(col.iloc[i]):
        i += 1
    if i == len(col):
        # Empty column or column of only nulls -> nothing to serialize.
        return serialize
    if isinstance(col.iloc[i], (Obs, Corr)):
        serialize = True
    elif isinstance(col.iloc[i], list):
        if all(isinstance(o, Obs) for o in col.iloc[i]):
            serialize = True
    return serialize
|
||||||
|
|
||||||
|
|
||||||
|
def _is_null(val):
|
||||||
|
"""Check if a value is null (None or NA), handling list/array values."""
|
||||||
|
return False if isinstance(val, (list, np.ndarray)) else pd.isna(val)
|
||||||
|
|
|
||||||
|
|
@ -244,6 +244,75 @@ def test_sql_if_exists_fail(tmp_path):
|
||||||
pe.input.pandas.to_sql(pe_df, "My_table", my_db, if_exists='replace')
|
pe.input.pandas.to_sql(pe_df, "My_table", my_db, if_exists='replace')
|
||||||
|
|
||||||
|
|
||||||
|
def test_string_column_df_export_import(tmp_path):
    """Round-trip a DataFrame with a pandas 'string'-dtype column through
    dump_df/load_df (gzipped and plain) and check both columns survive."""
    my_dict = {"str_col": "hello",
               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
    my_df = pd.DataFrame([my_dict] * 4)
    my_df["str_col"] = my_df["str_col"].astype("string")
    for gz in [True, False]:
        pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
        reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), gz=gz)
        assert np.all(reconstructed_df["Obs1"] == my_df["Obs1"])
        assert list(reconstructed_df["str_col"]) == list(my_df["str_col"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_string_column_with_none_df_export_import(tmp_path):
    """Round-trip a 'string'-dtype column containing None entries and
    check that nulls come back as Python None and values are preserved."""
    my_dict = {"str_col": "hello",
               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
    my_df = pd.DataFrame([my_dict] * 4)
    my_df["str_col"] = my_df["str_col"].astype("string")
    my_df.loc[0, "str_col"] = None
    my_df.loc[2, "str_col"] = None
    for gz in [True, False]:
        pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
        reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), gz=gz)
        assert reconstructed_df.loc[0, "str_col"] is None
        assert reconstructed_df.loc[2, "str_col"] is None
        assert reconstructed_df.loc[1, "str_col"] == "hello"
        assert np.all(reconstructed_df["Obs1"] == my_df["Obs1"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_string_column_sql_export_import(tmp_path):
    """Round-trip a 'string'-dtype column with a None entry through the
    SQLite export/import path (to_sql/read_sql)."""
    my_dict = {"str_col": "hello",
               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble")}
    my_df = pd.DataFrame([my_dict] * 4)
    my_df["str_col"] = my_df["str_col"].astype("string")
    my_df.loc[1, "str_col"] = None
    my_db = (tmp_path / "test_db.sqlite").as_posix()
    pe.input.pandas.to_sql(my_df, "test", my_db)
    reconstructed_df = pe.input.pandas.read_sql("SELECT * FROM test", my_db)
    assert reconstructed_df.loc[1, "str_col"] is None
    assert reconstructed_df.loc[0, "str_col"] == "hello"
    assert np.all(reconstructed_df["Obs1"] == my_df["Obs1"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_df_deserialize():
    """_deserialize_df must not raise on a DataFrame with zero rows."""
    empty_df = pd.DataFrame({"str_col": pd.Series(dtype="object"),
                             "int_col": pd.Series(dtype="int64")})
    result = pe.input.pandas._deserialize_df(empty_df)
    assert len(result) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_all_empty_string_column():
    """A column of only empty strings is converted to all-None by
    _deserialize_df (empty strings mark nulls in the serialized form)."""
    df = pd.DataFrame({"empty_str": ["", "", "", ""],
                       "val": [1, 2, 3, 4]})
    result = pe.input.pandas._deserialize_df(df)
    assert all(result.loc[i, "empty_str"] is None for i in range(4))
|
||||||
|
|
||||||
|
|
||||||
|
def test_Obs_list_with_none_auto_gamma(tmp_path):
    """Round-trip a column of Obs lists containing a None row with
    auto_gamma=True; the None row must survive and lists stay intact."""
    obs_list = [pe.pseudo_Obs(0.0, 0.1, "test_ensemble2"), pe.pseudo_Obs(3.2, 1.1, "test_ensemble2")]
    my_df = pd.DataFrame({"int": [1, 1, 1],
                          "Obs1": [pe.pseudo_Obs(17, 11, "test_ensemble")] * 3,
                          "Obs_list": [obs_list, None, obs_list]})
    for gz in [True, False]:
        pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
        re_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), auto_gamma=True, gz=gz)
        assert re_df.loc[1, "Obs_list"] is None
        assert len(re_df.loc[0, "Obs_list"]) == 2
        assert np.all(re_df["Obs1"] == my_df["Obs1"])
|
||||||
|
|
||||||
|
|
||||||
def test_Obs_list_sql(tmp_path):
|
def test_Obs_list_sql(tmp_path):
|
||||||
my_dict = {"int": 1,
|
my_dict = {"int": 1,
|
||||||
"Obs1": pe.pseudo_Obs(17, 11, "test_sql_if_exists_failnsemble"),
|
"Obs1": pe.pseudo_Obs(17, 11, "test_sql_if_exists_failnsemble"),
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue