[Fix] Pandas 3 string type handling

2026-04-15 01:01:38 +02:00 · 2026-02-03 15:39:43 +01:00 · 2026-02-03 15:39:43 +01:00 · 9b01f8007b
commit 9b01f8007b
parent da399b7c02
1 changed files with 29 additions and 19 deletions
--- a/pyerrors/input/pandas.py
+++ b/pyerrors/input/pandas.py
@ -145,9 +145,9 @@ def _serialize_df(df, gz=False):
        serialize = _need_to_serialize(out[column])

        if serialize is True:
-            out[column] = out[column].transform(lambda x: create_json_string(x, indent=0) if x is not None else None)
+            out[column] = out[column].transform(lambda x: create_json_string(x, indent=0) if not _is_null(x) else None)
            if gz is True:
-                out[column] = out[column].transform(lambda x: gzip.compress((x if x is not None else '').encode('utf-8')))
+                out[column] = out[column].transform(lambda x: gzip.compress(x.encode('utf-8')) if not _is_null(x) else gzip.compress(b''))
    return out


@ -166,37 +166,47 @@ def _deserialize_df(df, auto_gamma=False):
    ------
    In case any column of the DataFrame is gzipped it is gunzipped in the process.
    """
-    for column in df.select_dtypes(include="object"):
-        if isinstance(df[column][0], bytes):
-            if df[column][0].startswith(b"\x1f\x8b\x08\x00"):
-                df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8'))
+    # In pandas 3+, string columns use 'str' dtype instead of 'object'
+    string_like_dtypes = ["object", "str"] if int(pd.__version__.split(".")[0]) >= 3 else ["object"]
+    for column in df.select_dtypes(include=string_like_dtypes):
+        if isinstance(df[column].iloc[0], bytes):
+            if df[column].iloc[0].startswith(b"\x1f\x8b\x08\x00"):
+                df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8') if x else '')

-        if not all([e is None for e in df[column]]):
+        if df[column].notna().any():
            df[column] = df[column].replace({r'^$': None}, regex=True)
            i = 0
-            while df[column][i] is None:
+            while pd.isna(df[column].iloc[i]):
                i += 1
-            if isinstance(df[column][i], str):
-                if '"program":' in df[column][i][:20]:
-                    df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False) if x is not None else None)
+            if isinstance(df[column].iloc[i], str):
+                if '"program":' in df[column].iloc[i][:20]:
+                    df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False) if not pd.isna(x) else None)
                    if auto_gamma is True:
-                        if isinstance(df[column][i], list):
-                            df[column].apply(lambda x: [o.gm() if o is not None else x for o in x])
+                        if isinstance(df[column].iloc[i], list):
+                            df[column].apply(lambda x: [o.gm() if not pd.isna(o) else x for o in x])
                        else:
-                            df[column].apply(lambda x: x.gm() if x is not None else x)
+                            df[column].apply(lambda x: x.gm() if not pd.isna(x) else x)
+        # Convert NA values back to Python None for compatibility with `x is None` checks
+        if df[column].isna().any():
+            df[column] = df[column].astype(object).where(df[column].notna(), None)
    return df


 def _need_to_serialize(col):
    serialize = False
    i = 0
-    while i < len(col) and col[i] is None:
+    while i < len(col) and _is_null(col.iloc[i]):
        i += 1
    if i == len(col):
        return serialize
-    if isinstance(col[i], (Obs, Corr)):
+    if isinstance(col.iloc[i], (Obs, Corr)):
        serialize = True
-    elif isinstance(col[i], list):
-        if all(isinstance(o, Obs) for o in col[i]):
+    elif isinstance(col.iloc[i], list):
+        if all(isinstance(o, Obs) for o in col.iloc[i]):
            serialize = True
    return serialize
+
+
+def _is_null(val):
+    """Check if a value is null (None or NA), handling list/array values."""
+    return False if isinstance(val, (list, np.ndarray)) else pd.isna(val)