From a5b6f6916029fc545ae6b44733499c1325cce102 Mon Sep 17 00:00:00 2001
From: Justus Kuhlmann <82444481+jkuhl-uni@users.noreply.github.com>
Date: Thu, 18 May 2023 18:11:52 +0200
Subject: [PATCH] Slightly better Typechecking when exporting to SQL (#174)

* correct type clause

* add tests, changes in create_json_string

* create json-string now gives back None

* revert changes

* fix pandas sql export

* add SQL test

* fixed None type export for csv and sql.gz

* move None parsing to json io

* alter regex

* revert changes

* only replace None with empty str when necessary

* fixed _deserialize_df for python 3.7

* add more tests

* fix case where gz was ignored

* hand over gz explicitly

* replace nan by None in non-Obs columns

* moved warning to csv export, more tests

* only values able to be nan are put in np.isnan()

* added python float for warning
---
 pyerrors/input/json.py   |   1 -
 pyerrors/input/pandas.py |  52 ++++++++----
 tests/pandas_test.py     | 169 +++++++++++++++++++++++++++++++++++++--
 3 files changed, 198 insertions(+), 24 deletions(-)

diff --git a/pyerrors/input/json.py b/pyerrors/input/json.py
index 0ab23bae..50470bff 100644
--- a/pyerrors/input/json.py
+++ b/pyerrors/input/json.py
@@ -479,7 +479,6 @@ def import_json_string(json_string, verbose=True, full_output=False):
     result : dict
         if full_output=True
     """
-
     return _parse_json_dict(json.loads(json_string), verbose, full_output)
 
 
diff --git a/pyerrors/input/pandas.py b/pyerrors/input/pandas.py
index fdec1602..911c14d5 100644
--- a/pyerrors/input/pandas.py
+++ b/pyerrors/input/pandas.py
@@ -5,6 +5,7 @@ import pandas as pd
 from ..obs import Obs
 from ..correlators import Corr
 from .json import create_json_string, import_json_string
+import numpy as np
 
 
 def to_sql(df, table_name, db, if_exists='fail', gz=True, **kwargs):
@@ -76,6 +77,13 @@ def dump_df(df, fname, gz=True):
     -------
     None
     """
+    for column in df:
+        serialize = _need_to_serialize(df[column])
+        if not serialize:
+            if all(isinstance(entry, (int, np.integer, float, np.floating)) for entry in df[column]):
+                if any([np.isnan(entry) for entry in df[column]]):
+                    warnings.warn("nan value in column " + column + " will be replaced by None", UserWarning)
+
     out = _serialize_df(df, gz=False)
 
     if not fname.endswith('.csv'):
@@ -114,11 +122,11 @@ def load_df(fname, auto_gamma=False, gz=True):
         if not fname.endswith('.gz'):
             fname += '.gz'
         with gzip.open(fname) as f:
-            re_import = pd.read_csv(f)
+            re_import = pd.read_csv(f, keep_default_na=False)
     else:
         if fname.endswith('.gz'):
             warnings.warn("Trying to read from %s without unzipping!" % fname, UserWarning)
-        re_import = pd.read_csv(fname)
+        re_import = pd.read_csv(fname, keep_default_na=False)
 
     return _deserialize_df(re_import, auto_gamma=auto_gamma)
 
@@ -135,17 +143,12 @@ def _serialize_df(df, gz=False):
     """
     out = df.copy()
     for column in out:
-        serialize = False
-        if isinstance(out[column][0], (Obs, Corr)):
-            serialize = True
-        elif isinstance(out[column][0], list):
-            if all(isinstance(o, Obs) for o in out[column][0]):
-                serialize = True
+        serialize = _need_to_serialize(out[column])
 
         if serialize is True:
-            out[column] = out[column].transform(lambda x: create_json_string(x, indent=0))
+            out[column] = out[column].transform(lambda x: create_json_string(x, indent=0) if x is not None else None)
             if gz is True:
-                out[column] = out[column].transform(lambda x: gzip.compress(x.encode('utf-8')))
+                out[column] = out[column].transform(lambda x: gzip.compress((x if x is not None else '').encode('utf-8')))
     return out
 
 
@@ -168,12 +171,29 @@ def _deserialize_df(df, auto_gamma=False):
         if isinstance(df[column][0], bytes):
             if df[column][0].startswith(b"\x1f\x8b\x08\x00"):
                 df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8'))
-        if isinstance(df[column][0], str):
-            if '"program":' in df[column][0][:20]:
-                df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False))
+        df = df.replace({r'^$': None}, regex=True)
+        i = 0
+        while df[column][i] is None:
+            i += 1
+        if isinstance(df[column][i], str):
+            if '"program":' in df[column][i][:20]:
+                df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False) if x is not None else None)
                 if auto_gamma is True:
-                    if isinstance(df[column][0], list):
-                        df[column].apply(lambda x: [o.gm() for o in x])
+                    if isinstance(df[column][i], list):
+                        df[column].apply(lambda x: [o.gm() if o is not None else x for o in x])
                     else:
-                        df[column].apply(lambda x: x.gamma_method())
+                        df[column].apply(lambda x: x.gm() if x is not None else x)
     return df
+
+
+def _need_to_serialize(col):
+    serialize = False
+    i = 0
+    while col[i] is None:
+        i += 1
+    if isinstance(col[i], (Obs, Corr)):
+        serialize = True
+    elif isinstance(col[i], list):
+        if all(isinstance(o, Obs) for o in col[i]):
+            serialize = True
+    return serialize
diff --git a/tests/pandas_test.py b/tests/pandas_test.py
index 81e087b0..fdff4f10 100644
--- a/tests/pandas_test.py
+++ b/tests/pandas_test.py
@@ -2,12 +2,14 @@ import numpy as np
 import pandas as pd
 import pyerrors as pe
 import pytest
+import warnings
+
 
 def test_df_export_import(tmp_path):
     my_dict = {"int": 1,
-           "float": -0.01,
-           "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
-           "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
+               "float": -0.01,
+               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
+               "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
     for gz in [True, False]:
         my_df = pd.DataFrame([my_dict] * 10)
 
@@ -18,13 +20,166 @@ def test_df_export_import(tmp_path):
         pe.input.pandas.load_df((tmp_path / 'df_output.csv').as_posix(), gz=gz)
 
 
+def test_null_first_line_df_export_import(tmp_path):
+    my_dict = {"int": 1,
+               "float": -0.01,
+               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
+               "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
+    my_df = pd.DataFrame([my_dict] * 4)
+    my_df.loc[0, "Obs1"] = None
+    my_df.loc[2, "Obs1"] = None
+    for gz in [True, False]:
+        pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
+        reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), auto_gamma=True, gz=gz)
+        assert reconstructed_df.loc[0, "Obs1"] is None
+        assert reconstructed_df.loc[2, "Obs1"] is None
+        assert np.all(reconstructed_df.loc[1] == my_df.loc[1])
+        assert np.all(reconstructed_df.loc[3] == my_df.loc[3])
+
+
+def test_nan_df_export_import(tmp_path):
+    my_dict = {"int": 1,
+               "float": -0.01,
+               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
+               "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
+    my_df = pd.DataFrame([my_dict] * 4)
+    my_df.loc[1, "int"] = np.nan
+
+    for gz in [True, False]:
+        pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
+        reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), auto_gamma=True, gz=gz)
+        with pytest.warns(UserWarning, match="nan value in column int will be replaced by None"):
+            warnings.warn("nan value in column int will be replaced by None", UserWarning)
+        assert reconstructed_df.loc[1, "int"] is None
+        assert np.all(reconstructed_df.loc[:, "float"] == my_df.loc[:, "float"])
+        assert np.all(reconstructed_df.loc[:, "Obs1"] == my_df.loc[:, "Obs1"])
+        assert np.all(reconstructed_df.loc[:, "Obs2"] == my_df.loc[:, "Obs2"])
+
+
+def test_null_second_line_df_export_import(tmp_path):
+    my_dict = {"int": 1,
+               "float": -0.01,
+               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
+               "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
+    my_df = pd.DataFrame([my_dict] * 4)
+    my_df.loc[1, "Obs1"] = None
+    for gz in [True, False]:
+        pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz)
+        reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), auto_gamma=True, gz=gz)
+        assert reconstructed_df.loc[1, "Obs1"] is None
+        assert np.all(reconstructed_df.loc[0] == my_df.loc[0])
+        assert np.all(reconstructed_df.loc[2:] == my_df.loc[2:])
+
+
+def test_null_first_line_df_gzsql_export_import(tmp_path):
+    my_dict = {"int": 1,
+               "float": -0.01,
+               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
+               "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
+
+    my_df = pd.DataFrame([my_dict] * 4)
+    my_df.loc[0, "Obs1"] = None
+    my_df.loc[2, "Obs1"] = None
+    gz = True
+    pe.input.pandas.to_sql(my_df, 'test', (tmp_path / 'test.db').as_posix(), gz=gz)
+    reconstructed_df = pe.input.pandas.read_sql('SELECT * FROM test', (tmp_path / 'test.db').as_posix(), auto_gamma=True)
+    assert reconstructed_df.loc[0, "Obs1"] is None
+    assert reconstructed_df.loc[2, "Obs1"] is None
+    assert np.all(reconstructed_df.loc[1] == my_df.loc[1])
+    assert np.all(reconstructed_df.loc[3] == my_df.loc[3])
+
+
+def test_null_second_line_df_gzsql_export_import(tmp_path):
+    my_dict = {"int": 1,
+               "float": -0.01,
+               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
+               "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
+
+    my_df = pd.DataFrame([my_dict] * 4)
+    my_df.loc[1, "Obs1"] = None
+    gz = True
+    pe.input.pandas.to_sql(my_df, 'test', (tmp_path / 'test.db').as_posix(), gz=gz)
+    reconstructed_df = pe.input.pandas.read_sql('SELECT * FROM test', (tmp_path / 'test.db').as_posix(), auto_gamma=True)
+    assert reconstructed_df.loc[1, "Obs1"] is None
+    assert np.all(reconstructed_df.loc[0] == my_df.loc[0])
+    assert np.all(reconstructed_df.loc[2:] == my_df.loc[2:])
+
+
+def test_null_first_line_df_sql_export_import(tmp_path):
+    my_dict = {"int": 1,
+               "float": -0.01,
+               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
+               "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
+
+    my_df = pd.DataFrame([my_dict] * 4)
+    my_df.loc[0, "Obs1"] = None
+    my_df.loc[2, "Obs1"] = None
+    gz = False
+    pe.input.pandas.to_sql(my_df, 'test', (tmp_path / 'test.db').as_posix(), gz=gz)
+    reconstructed_df = pe.input.pandas.read_sql('SELECT * FROM test', (tmp_path / 'test.db').as_posix(), auto_gamma=True)
+    assert reconstructed_df.loc[0, "Obs1"] is None
+    assert reconstructed_df.loc[2, "Obs1"] is None
+    assert np.all(reconstructed_df.loc[1] == my_df.loc[1])
+    assert np.all(reconstructed_df.loc[3] == my_df.loc[3])
+
+
+def test_nan_sql_export_import(tmp_path):
+    my_dict = {"int": 1,
+               "float": -0.01,
+               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
+               "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
+    my_df = pd.DataFrame([my_dict] * 4)
+    my_df.loc[1, "int"] = np.nan
+    gz = False
+    pe.input.pandas.to_sql(my_df, 'test', (tmp_path / 'test.db').as_posix(), gz=gz)
+    reconstructed_df = pe.input.pandas.read_sql('SELECT * FROM test', (tmp_path / 'test.db').as_posix(), auto_gamma=True)
+    with pytest.warns(UserWarning, match="nan value in column int will be replaced by None"):
+        warnings.warn("nan value in column int will be replaced by None", UserWarning)
+    assert np.isnan(reconstructed_df.loc[1, "int"])
+    assert np.all(reconstructed_df.loc[:, "float"] == my_df.loc[:, "float"])
+    assert np.all(reconstructed_df.loc[:, "Obs1"] == my_df.loc[:, "Obs1"])
+    assert np.all(reconstructed_df.loc[:, "Obs2"] == my_df.loc[:, "Obs2"])
+
+
+def test_nan_gzsql_export_import(tmp_path):
+    my_dict = {"int": 1,
+               "float": -0.01,
+               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
+               "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
+    my_df = pd.DataFrame([my_dict] * 4)
+    my_df.loc[1, "int"] = np.nan
+    gz = True
+    pe.input.pandas.to_sql(my_df, 'test', (tmp_path / 'test.db').as_posix(), gz=gz)
+    reconstructed_df = pe.input.pandas.read_sql('SELECT * FROM test', (tmp_path / 'test.db').as_posix(), auto_gamma=True)
+    assert np.isnan(reconstructed_df.loc[1, "int"])
+    assert np.all(reconstructed_df.loc[:, "float"] == my_df.loc[:, "float"])
+    assert np.all(reconstructed_df.loc[:, "Obs1"] == my_df.loc[:, "Obs1"])
+    assert np.all(reconstructed_df.loc[:, "Obs2"] == my_df.loc[:, "Obs2"])
+
+
+def test_null_second_line_df_sql_export_import(tmp_path):
+    my_dict = {"int": 1,
+               "float": -0.01,
+               "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
+               "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
+
+    my_df = pd.DataFrame([my_dict] * 4)
+    my_df.loc[1, "Obs1"] = None
+    gz = False
+    pe.input.pandas.to_sql(my_df, 'test', (tmp_path / 'test.db').as_posix(), gz=gz)
+    reconstructed_df = pe.input.pandas.read_sql('SELECT * FROM test', (tmp_path / 'test.db').as_posix(), auto_gamma=True)
+    assert reconstructed_df.loc[1, "Obs1"] is None
+    assert np.all(reconstructed_df.loc[0] == my_df.loc[0])
+    assert np.all(reconstructed_df.loc[2:] == my_df.loc[2:])
+
+
 def test_df_Corr(tmp_path):
 
     my_corr = pe.Corr([pe.pseudo_Obs(-0.48, 0.04, "test"), pe.pseudo_Obs(-0.154, 0.03, "test")])
 
     my_dict = {"int": 1,
-           "float": -0.01,
-           "Corr": my_corr}
+               "float": -0.01,
+               "Corr": my_corr}
     my_df = pd.DataFrame([my_dict] * 5)
 
     pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix())
@@ -76,8 +231,8 @@ def test_sql_if_exists_fail(tmp_path):
 
 def test_Obs_list_sql(tmp_path):
     my_dict = {"int": 1,
-           "Obs1": pe.pseudo_Obs(17, 11, "test_sql_if_exists_failnsemble"),
-           "Obs_list": [[pe.pseudo_Obs(0.0, 0.1, "test_ensemble2"), pe.pseudo_Obs(3.2, 1.1, "test_ensemble2")]]}
+               "Obs1": pe.pseudo_Obs(17, 11, "test_sql_if_exists_failnsemble"),
+               "Obs_list": [[pe.pseudo_Obs(0.0, 0.1, "test_ensemble2"), pe.pseudo_Obs(3.2, 1.1, "test_ensemble2")]]}
     pe_df = pd.DataFrame(my_dict)
     my_db = (tmp_path / "test_db.sqlite").as_posix()
     pe.input.pandas.to_sql(pe_df, "My_table", my_db)