From cc9f47c6860b9c31127921975d6c69da38e41733 Mon Sep 17 00:00:00 2001
From: Fabian Joswig <fabian.joswig@ed.ac.uk>
Date: Mon, 4 Jul 2022 14:19:30 +0100
Subject: [PATCH] feat: pandas DataFrame serialization and deserialization can
 now also deal with gzipped json columns. Tests added.

---
 pyerrors/input/pandas.py | 17 ++++++++++++++---
 tests/pandas_test.py     |  9 +++++++++
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/pyerrors/input/pandas.py b/pyerrors/input/pandas.py
index 699392a0..24296781 100644
--- a/pyerrors/input/pandas.py
+++ b/pyerrors/input/pandas.py
@@ -22,7 +22,7 @@ def dump_df(df, fname, gz=True):
         If True, the output is a gzipped csv file. If False, the output is a csv file.
     """
 
-    out = serialize_df(df)
+    out = serialize_df(df, gz=False)
 
     if not fname.endswith('.csv'):
         fname += '.csv'
@@ -62,21 +62,25 @@ def load_df(fname, auto_gamma=False, gz=True):
             warnings.warn("Trying to read from %s without unzipping!" % fname, UserWarning)
         re_import = pd.read_csv(fname)
 
-    return deserialize_df(re_import, auto_gamma)
+    return deserialize_df(re_import, auto_gamma=auto_gamma)
 
 
-def serialize_df(df):
+def serialize_df(df, gz=False):
     """Serializes all Obs or Corr valued columns into json strings according to the pyerrors json specification.
 
     Parameters
     ----------
     df : pandas.DataFrame
         DataFrame to be serilized.
+    gz : bool
+        If True, the json string representation is gzipped. Default False.
     """
     out = df.copy()
     for column in out:
         if isinstance(out[column][0], (Obs, Corr)):
             out[column] = out[column].transform(lambda x: create_json_string(x, indent=0))
+            if gz is True:
+                out[column] = out[column].transform(lambda x: gzip.compress(x.encode('utf-8')))
     return out
 
 
@@ -90,8 +94,15 @@ def deserialize_df(df, auto_gamma=False):
     auto_gamma : bool
         If True applies the gamma_method to all imported Obs objects with the default parameters for
         the error analysis. Default False.
+
+    Notes
+    -----
+    In case any column of the DataFrame is gzipped it is gunzipped in the process.
     """
     for column in df.select_dtypes(include="object"):
+        if isinstance(df[column][0], bytes):
+            if df[column][0].startswith(b"\x1f\x8b\x08\x00"):
+                df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8'))
         if isinstance(df[column][0], str):
             if df[column][0][:20] == '{"program":"pyerrors':
                 df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False))
diff --git a/tests/pandas_test.py b/tests/pandas_test.py
index 976a4bf1..dec98456 100644
--- a/tests/pandas_test.py
+++ b/tests/pandas_test.py
@@ -42,3 +42,12 @@ def test_pe_export_default_import(tmp_path):
     pe.input.pandas.dump_df(df, (tmp_path / 'pe_df').as_posix(), gz=False)
     re_df = pd.read_csv((tmp_path / 'pe_df.csv').as_posix())
     assert np.all(df == re_df)
+
+
+def test_gz_serialization():
+    my_obs = pe.pseudo_Obs(0.1, 0.01, "pandas DataFrame ensemble only for test purposes.")
+    my_df = pd.DataFrame([{"Label": 1, "Obs": my_obs}])
+    for gz in [False, True]:  # round trip both the plain json and the gzipped representation
+        ser = pe.input.pandas.serialize_df(my_df, gz=gz)
+        deser = pe.input.pandas.deserialize_df(ser)
+        assert np.all(my_df == deser)  # without the assert the comparison result is discarded