From cc9f47c6860b9c31127921975d6c69da38e41733 Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Mon, 4 Jul 2022 14:19:30 +0100 Subject: [PATCH] feat: pandas DataFrame serialization and deserialization can now also deal with gzipped json columns. Tests added. --- pyerrors/input/pandas.py | 17 ++++++++++++++--- tests/pandas_test.py | 9 +++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/pyerrors/input/pandas.py b/pyerrors/input/pandas.py index 699392a0..24296781 100644 --- a/pyerrors/input/pandas.py +++ b/pyerrors/input/pandas.py @@ -22,7 +22,7 @@ def dump_df(df, fname, gz=True): If True, the output is a gzipped csv file. If False, the output is a csv file. """ - out = serialize_df(df) + out = serialize_df(df, gz=False) if not fname.endswith('.csv'): fname += '.csv' @@ -62,21 +62,25 @@ def load_df(fname, auto_gamma=False, gz=True): warnings.warn("Trying to read from %s without unzipping!" % fname, UserWarning) re_import = pd.read_csv(fname) - return deserialize_df(re_import, auto_gamma) + return deserialize_df(re_import, auto_gamma=auto_gamma) -def serialize_df(df): +def serialize_df(df, gz=False): """Serializes all Obs or Corr valued columns into json strings according to the pyerrors json specification. Parameters ---------- df : pandas.DataFrame DataFrame to be serilized. + gz : bool + gzip the json string representation. Default False. """ out = df.copy() for column in out: if isinstance(out[column][0], (Obs, Corr)): out[column] = out[column].transform(lambda x: create_json_string(x, indent=0)) + if gz is True: + out[column] = out[column].transform(lambda x: gzip.compress(x.encode('utf-8'))) return out @@ -90,8 +94,15 @@ def deserialize_df(df, auto_gamma=False): auto_gamma : bool If True applies the gamma_method to all imported Obs objects with the default parameters for the error analysis. Default False. + + Notes: + ------ + In case any column of the DataFrame is gzipped it is gunzipped in the process. 
""" for column in df.select_dtypes(include="object"): + if isinstance(df[column][0], bytes): + if df[column][0].startswith(b"\x1f\x8b\x08\x00"): + df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8')) if isinstance(df[column][0], str): if df[column][0][:20] == '{"program":"pyerrors': df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False)) diff --git a/tests/pandas_test.py b/tests/pandas_test.py index 976a4bf1..dec98456 100644 --- a/tests/pandas_test.py +++ b/tests/pandas_test.py @@ -42,3 +42,12 @@ def test_pe_export_default_import(tmp_path): pe.input.pandas.dump_df(df, (tmp_path / 'pe_df').as_posix(), gz=False) re_df = pd.read_csv((tmp_path / 'pe_df.csv').as_posix()) assert np.all(df == re_df) + + +def test_gz_serialization(): + my_obs = pe.pseudo_Obs(0.1, 0.01, "pandas DataFrame ensemble only for test purposes.") + my_df = pd.DataFrame([{"Label": 1, "Obs": my_obs}]) + for gz in [False, True]: + ser = pe.input.pandas.serialize_df(my_df, gz=gz) + deser = pe.input.pandas.deserialize_df(ser) + np.all(my_df == deser)