[Fix] Pandas 3 string type handling

This commit is contained in:
Fabian Joswig 2026-02-03 15:39:43 +01:00
commit 9b01f8007b

View file

@ -145,9 +145,9 @@ def _serialize_df(df, gz=False):
serialize = _need_to_serialize(out[column])
if serialize is True:
out[column] = out[column].transform(lambda x: create_json_string(x, indent=0) if x is not None else None)
out[column] = out[column].transform(lambda x: create_json_string(x, indent=0) if not _is_null(x) else None)
if gz is True:
out[column] = out[column].transform(lambda x: gzip.compress((x if x is not None else '').encode('utf-8')))
out[column] = out[column].transform(lambda x: gzip.compress(x.encode('utf-8')) if not _is_null(x) else gzip.compress(b''))
return out
@ -166,37 +166,47 @@ def _deserialize_df(df, auto_gamma=False):
------
In case any column of the DataFrame is gzipped it is gunzipped in the process.
"""
for column in df.select_dtypes(include="object"):
if isinstance(df[column][0], bytes):
if df[column][0].startswith(b"\x1f\x8b\x08\x00"):
df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8'))
# In pandas 3+, string columns use 'str' dtype instead of 'object'
string_like_dtypes = ["object", "str"] if int(pd.__version__.split(".")[0]) >= 3 else ["object"]
for column in df.select_dtypes(include=string_like_dtypes):
if isinstance(df[column].iloc[0], bytes):
if df[column].iloc[0].startswith(b"\x1f\x8b\x08\x00"):
df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8') if x else '')
if not all([e is None for e in df[column]]):
if df[column].notna().any():
df[column] = df[column].replace({r'^$': None}, regex=True)
i = 0
while df[column][i] is None:
while pd.isna(df[column].iloc[i]):
i += 1
if isinstance(df[column][i], str):
if '"program":' in df[column][i][:20]:
df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False) if x is not None else None)
if isinstance(df[column].iloc[i], str):
if '"program":' in df[column].iloc[i][:20]:
df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False) if not pd.isna(x) else None)
if auto_gamma is True:
if isinstance(df[column][i], list):
df[column].apply(lambda x: [o.gm() if o is not None else x for o in x])
if isinstance(df[column].iloc[i], list):
df[column].apply(lambda x: [o.gm() if not pd.isna(o) else x for o in x])
else:
df[column].apply(lambda x: x.gm() if x is not None else x)
df[column].apply(lambda x: x.gm() if not pd.isna(x) else x)
# Convert NA values back to Python None for compatibility with `x is None` checks
if df[column].isna().any():
df[column] = df[column].astype(object).where(df[column].notna(), None)
return df
def _need_to_serialize(col):
serialize = False
i = 0
while i < len(col) and col[i] is None:
while i < len(col) and _is_null(col.iloc[i]):
i += 1
if i == len(col):
return serialize
if isinstance(col[i], (Obs, Corr)):
if isinstance(col.iloc[i], (Obs, Corr)):
serialize = True
elif isinstance(col[i], list):
if all(isinstance(o, Obs) for o in col[i]):
elif isinstance(col.iloc[i], list):
if all(isinstance(o, Obs) for o in col.iloc[i]):
serialize = True
return serialize
def _is_null(val):
"""Check if a value is null (None or NA), handling list/array values."""
return False if isinstance(val, (list, np.ndarray)) else pd.isna(val)