[Fix] Address edge cases in _deserialize_df

- Use pd.isna() instead of a truthiness check for the gzip null guard, fixing
  incorrect behavior when a null entry is np.nan (NaN is truthy in Python, so
  the old `if x` guard passed it on to gzip.decompress)
- Add a bounds check to the while loop that skips leading null entries, to
  prevent an IndexError when every non-null value is an empty string that the
  regex replace converts to None
This commit is contained in:
Fabian Joswig 2026-02-19 16:11:47 +01:00
commit 69a33c80c1

View file

@@ -171,12 +171,12 @@ def _deserialize_df(df, auto_gamma=False):
for column in df.select_dtypes(include=string_like_dtypes):
if isinstance(df[column].iloc[0], bytes):
if df[column].iloc[0].startswith(b"\x1f\x8b\x08\x00"):
df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8') if x else '')
df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8') if not pd.isna(x) else '')
if df[column].notna().any():
df[column] = df[column].replace({r'^$': None}, regex=True)
i = 0
while pd.isna(df[column].iloc[i]):
while i < len(df[column]) and pd.isna(df[column].iloc[i]):
i += 1
if isinstance(df[column].iloc[i], str):
if '"program":' in df[column].iloc[i][:20]: