feat: pandas DataFrame serialization and deserialization can now also deal with gzipped json columns. Tests added.
This commit is contained in:
Fabian Joswig 2022-07-04 14:19:30 +01:00
parent a133030114
commit cc9f47c686
2 changed files with 23 additions and 3 deletions

View file

@ -22,7 +22,7 @@ def dump_df(df, fname, gz=True):
If True, the output is a gzipped csv file. If False, the output is a csv file. If True, the output is a gzipped csv file. If False, the output is a csv file.
""" """
out = serialize_df(df) out = serialize_df(df, gz=False)
if not fname.endswith('.csv'): if not fname.endswith('.csv'):
fname += '.csv' fname += '.csv'
@ -62,21 +62,25 @@ def load_df(fname, auto_gamma=False, gz=True):
warnings.warn("Trying to read from %s without unzipping!" % fname, UserWarning) warnings.warn("Trying to read from %s without unzipping!" % fname, UserWarning)
re_import = pd.read_csv(fname) re_import = pd.read_csv(fname)
return deserialize_df(re_import, auto_gamma) return deserialize_df(re_import, auto_gamma=auto_gamma)
def serialize_df(df, gz=False):
    """Serializes all Obs or Corr valued columns into json strings according to the pyerrors json specification.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame to be serialized. The input is left untouched; the work is done on a copy.
    gz : bool
        If truthy, gzip the json string representation of every serialized entry. Default False.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` in which each column whose first entry is an Obs or Corr has been
        replaced by json strings (or by gzip-compressed bytes if `gz` is set).
    """
    out = df.copy()
    if out.empty:
        # There is no first row to inspect, so no column can be identified as Obs/Corr valued.
        return out
    for column in out:
        # Positional access: the index is not guaranteed to contain the label 0
        # (e.g. after filtering rows or with a non-integer index).
        if isinstance(out[column].iloc[0], (Obs, Corr)):
            out[column] = out[column].transform(lambda x: create_json_string(x, indent=0))
            if gz:
                out[column] = out[column].transform(lambda x: gzip.compress(x.encode('utf-8')))
    return out
@ -90,8 +94,15 @@ def deserialize_df(df, auto_gamma=False):
auto_gamma : bool auto_gamma : bool
If True applies the gamma_method to all imported Obs objects with the default parameters for If True applies the gamma_method to all imported Obs objects with the default parameters for
the error analysis. Default False. the error analysis. Default False.
Notes:
------
In case any column of the DataFrame is gzipped it is gunzipped in the process.
""" """
for column in df.select_dtypes(include="object"): for column in df.select_dtypes(include="object"):
if isinstance(df[column][0], bytes):
if df[column][0].startswith(b"\x1f\x8b\x08\x00"):
df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8'))
if isinstance(df[column][0], str): if isinstance(df[column][0], str):
if df[column][0][:20] == '{"program":"pyerrors': if df[column][0][:20] == '{"program":"pyerrors':
df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False)) df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False))

View file

@ -42,3 +42,12 @@ def test_pe_export_default_import(tmp_path):
pe.input.pandas.dump_df(df, (tmp_path / 'pe_df').as_posix(), gz=False) pe.input.pandas.dump_df(df, (tmp_path / 'pe_df').as_posix(), gz=False)
re_df = pd.read_csv((tmp_path / 'pe_df.csv').as_posix()) re_df = pd.read_csv((tmp_path / 'pe_df.csv').as_posix())
assert np.all(df == re_df) assert np.all(df == re_df)
def test_gz_serialization():
    """Round-trip an Obs valued DataFrame through serialize_df and deserialize_df, with and without gzip."""
    my_obs = pe.pseudo_Obs(0.1, 0.01, "pandas DataFrame ensemble only for test purposes.")
    my_df = pd.DataFrame([{"Label": 1, "Obs": my_obs}])
    for gz in [False, True]:
        ser = pe.input.pandas.serialize_df(my_df, gz=gz)
        deser = pe.input.pandas.deserialize_df(ser)
        # The comparison has to be asserted; as a bare expression its result is discarded
        # and the test could never fail.
        assert np.all(my_df == deser)