Repository 'sklearn_data_preprocess'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/sklearn_data_preprocess

Changeset 29:66df2aa6cd6b (2019-11-01)
Previous changeset: 28:50b3e080cef0 (2019-10-02)
Next changeset: 30:c72131b8fc7a (2019-11-07)
Commit message:
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
modified:
main_macros.xml
stacking_ensembles.py
added:
fitted_model_eval.py
simple_model_fit.py
test-data/fitted_model_eval01.tabular
test-data/model_fit01
test-data/model_fit02
test-data/model_fit02.h5
test-data/regression_y_split_test01.tabular
test-data/train_test_split_test01.tabular
test-data/train_test_split_test02.tabular
test-data/train_test_split_test03.tabular
test-data/train_test_split_train01.tabular
test-data/train_test_split_train02.tabular
test-data/train_test_split_train03.tabular
train_test_split.py
diff -r 50b3e080cef0 -r 66df2aa6cd6b fitted_model_eval.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fitted_model_eval.py Fri Nov 01 17:13:42 2019 -0400
@@ -0,0 +1,160 @@
+import argparse
+import json
+import pandas as pd
+import warnings
+
+from scipy.io import mmread
+from sklearn.pipeline import Pipeline
+from sklearn.metrics.scorer import _check_multimetric_scoring
+from sklearn.model_selection._validation import _score
+from galaxy_ml.utils import get_scoring, load_model, read_columns
+
+
+def _get_X_y(params, infile1, infile2):
+    """ read from inputs and output X and y
+
+    Parameters
+    ----------
+    params : dict
+        Galaxy tool parameters
+    infile1 : str
+        File path to dataset containing features
+    infile2 : str
+        File path to dataset containing target values
+
+    """
+    # store read dataframe object
+    loaded_df = {}
+
+    input_type = params['input_options']['selected_input']
+    # tabular input
+    if input_type == 'tabular':
+        header = 'infer' if params['input_options']['header1'] else None
+        column_option = (params['input_options']['column_selector_options_1']
+                         ['selected_column_selector_option'])
+        if column_option in ['by_index_number', 'all_but_by_index_number',
+                             'by_header_name', 'all_but_by_header_name']:
+            c = params['input_options']['column_selector_options_1']['col1']
+        else:
+            c = None
+
+        df_key = infile1 + repr(header)
+        df = pd.read_csv(infile1, sep='\t', header=header,
+                         parse_dates=True)
+        loaded_df[df_key] = df
+
+        X = read_columns(df, c=c, c_option=column_option).astype(float)
+    # sparse input
+    elif input_type == 'sparse':
+        X = mmread(open(infile1, 'r'))
+
+    # Get target y
+    header = 'infer' if params['input_options']['header2'] else None
+    column_option = (params['input_options']['column_selector_options_2']
+                     ['selected_column_selector_option2'])
+    if column_option in ['by_index_number', 'all_but_by_index_number',
+                         'by_header_name', 'all_but_by_header_name']:
+        c = params['input_options']['column_selector_options_2']['col2']
+    else:
+        c = None
+
+    df_key = infile2 + repr(header)
+    if df_key in loaded_df:
+        infile2 = loaded_df[df_key]
+    else:
+        infile2 = pd.read_csv(infile2, sep='\t',
+                              header=header, parse_dates=True)
+        loaded_df[df_key] = infile2
+
+    y = read_columns(
+            infile2,
+            c=c,
+            c_option=column_option,
+            sep='\t',
+            header=header,
+            parse_dates=True)
+    if len(y.shape) == 2 and y.shape[1] == 1:
+        y = y.ravel()
+
+    return X, y
+
+
+def main(inputs, infile_estimator, outfile_eval,
+         infile_weights=None, infile1=None,
+         infile2=None):
+    """
+    Parameters
+    ----------
+    inputs : str
+        File path to galaxy tool parameter
+
+    infile_estimator : str
+        File path to trained estimator input
+
+    outfile_eval : str
+        File path to save the evaluation results, tabular
+
+    infile_weights : str
+        File path to weights input
+
+    infile1 : str
+        File path to dataset containing features
+
+    infile2 : str
+        File path to dataset containing target values
+    """
+    warnings.filterwarnings('ignore')
+
+    with open(inputs, 'r') as param_handler:
+        params = json.load(param_handler)
+
+    X_test, y_test = _get_X_y(params, infile1, infile2)
+
+    # load model
+    with open(infile_estimator, 'rb') as est_handler:
+        estimator = load_model(est_handler)
+
+    main_est = estimator
+    if isinstance(estimator, Pipeline):
+        main_est = estimator.steps[-1][-1]
+    if hasattr(main_est, 'config') and hasattr(main_est, 'load_weights'):
+        if not infile_weights or infile_weights == 'None':
+            raise ValueError("The selected model skeleton asks for weights, "
+                             "but no dataset for weights was provided!")
+        main_est.load_weights(infile_weights)
+
+    # handle scorer, convert to scorer dict
+    scoring = params['scoring']
+    scorer = get_scoring(scoring)
+    scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer)
+
+    if hasattr(estimator, 'evaluate'):
+        scores = estimator.evaluate(X_test, y_test=y_test,
+                                    scorer=scorer,
+                                    is_multimetric=True)
+    else:
+        scores = _score(estimator, X_test, y_test, scorer,
+                        is_multimetric=True)
+
+    # handle output
+    for name, score in scores.items():
+        scores[name] = [score]
+    df = pd.DataFrame(scores)
+    df = df[sorted(df.columns)]
+    df.to_csv(path_or_buf=outfile_eval, sep='\t',
+              header=True, index=False)
+
+
+if __name__ == '__main__':
+    aparser = argparse.ArgumentParser()
+    aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
+    aparser.add_argument("-e", "--infile_estimator", dest="infile_estimator")
+    aparser.add_argument("-w", "--infile_weights", dest="infile_weights")
+    aparser.add_argument("-X", "--infile1", dest="infile1")
+    aparser.add_argument("-y", "--infile2", dest="infile2")
+    aparser.add_argument("-O", "--outfile_eval", dest="outfile_eval")
+    args = aparser.parse_args()
+
+    main(args.inputs, args.infile_estimator, args.outfile_eval,
+         infile_weights=args.infile_weights, infile1=args.infile1,
+         infile2=args.infile2)
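
For orientation, a minimal sketch of driving the new evaluation script directly from Python; the call matches the `main` signature above, but the file names are hypothetical placeholders, not files shipped with this changeset (apart from the pickled fixture name):

    from fitted_model_eval import main

    # params.json is the Galaxy tool-parameter JSON; model_fit01 is a pickled
    # estimator such as the new test-data fixture of the same name.
    main('params.json', 'model_fit01', 'eval_scores.tabular',
         infile1='test_features.tabular', infile2='test_targets.tabular')

The script writes a one-row tabular file whose sorted columns are the requested metric names (compare test-data/fitted_model_eval01.tabular below).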
diff -r 50b3e080cef0 -r 66df2aa6cd6b main_macros.xml
--- a/main_macros.xml Wed Oct 02 03:55:06 2019 -0400
+++ b/main_macros.xml Fri Nov 01 17:13:42 2019 -0400
@@ -328,8 +328,8 @@
 
   <!--Data interface-->
 
-  <xml name="samples_tabular" token_multiple1="false" token_multiple2="false">
-    <param name="infile1" type="data" format="tabular" label="Training samples dataset:"/>
+  <xml name="samples_tabular" token_label1="Training samples dataset:" token_multiple1="false" token_multiple2="false">
+    <param name="infile1" type="data" format="tabular" label="@LABEL1@"/>
     <param name="header1" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="False" label="Does the dataset contain header:" />
     <conditional name="column_selector_options_1">
       <expand macro="samples_column_selector_options" multiple="@MULTIPLE1@"/>
diff -r 50b3e080cef0 -r 66df2aa6cd6b simple_model_fit.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/simple_model_fit.py Fri Nov 01 17:13:42 2019 -0400
@@ -0,0 +1,145 @@
+import argparse
+import json
+import pandas as pd
+import pickle
+
+from galaxy_ml.utils import load_model, read_columns
+from scipy.io import mmread  # used by the sparse-input branch below
+from sklearn.pipeline import Pipeline
+
+
+def _get_X_y(params, infile1, infile2):
+    """ read from inputs and output X and y
+
+    Parameters
+    ----------
+    params : dict
+        Galaxy tool parameters
+    infile1 : str
+        File path to dataset containing features
+    infile2 : str
+        File path to dataset containing target values
+
+    """
+    # store read dataframe object
+    loaded_df = {}
+
+    input_type = params['input_options']['selected_input']
+    # tabular input
+    if input_type == 'tabular':
+        header = 'infer' if params['input_options']['header1'] else None
+        column_option = (params['input_options']['column_selector_options_1']
+                         ['selected_column_selector_option'])
+        if column_option in ['by_index_number', 'all_but_by_index_number',
+                             'by_header_name', 'all_but_by_header_name']:
+            c = params['input_options']['column_selector_options_1']['col1']
+        else:
+            c = None
+
+        df_key = infile1 + repr(header)
+        df = pd.read_csv(infile1, sep='\t', header=header,
+                         parse_dates=True)
+        loaded_df[df_key] = df
+
+        X = read_columns(df, c=c, c_option=column_option).astype(float)
+    # sparse input
+    elif input_type == 'sparse':
+        X = mmread(open(infile1, 'r'))
+
+    # Get target y
+    header = 'infer' if params['input_options']['header2'] else None
+    column_option = (params['input_options']['column_selector_options_2']
+                     ['selected_column_selector_option2'])
+    if column_option in ['by_index_number', 'all_but_by_index_number',
+                         'by_header_name', 'all_but_by_header_name']:
+        c = params['input_options']['column_selector_options_2']['col2']
+    else:
+        c = None
+
+    df_key = infile2 + repr(header)
+    if df_key in loaded_df:
+        infile2 = loaded_df[df_key]
+    else:
+        infile2 = pd.read_csv(infile2, sep='\t',
+                              header=header, parse_dates=True)
+        loaded_df[df_key] = infile2
+
+    y = read_columns(
+            infile2,
+            c=c,
+            c_option=column_option,
+            sep='\t',
+            header=header,
+            parse_dates=True)
+    if len(y.shape) == 2 and y.shape[1] == 1:
+        y = y.ravel()
+
+    return X, y
+
+
+def main(inputs, infile_estimator, infile1, infile2, out_object,
+         out_weights=None):
+    """ main
+
+    Parameters
+    ----------
+    inputs : str
+        File path to galaxy tool parameter
+
+    infile_estimator : str
+        File path to the input estimator
+
+    infile1 : str
+        File path to dataset containing features
+
+    infile2 : str
+        File path to dataset containing target labels
+
+    out_object : str
+        File path for output of fitted model or skeleton
+
+    out_weights : str
+        File path for output of weights
+
+    """
+    with open(inputs, 'r') as param_handler:
+        params = json.load(param_handler)
+
+    # load model
+    with open(infile_estimator, 'rb') as est_handler:
+        estimator = load_model(est_handler)
+
+    X_train, y_train = _get_X_y(params, infile1, infile2)
+
+    estimator.fit(X_train, y_train)
+
+    main_est = estimator
+    if isinstance(main_est, Pipeline):
+        main_est = main_est.steps[-1][-1]
+    if hasattr(main_est, 'model_') \
+            and hasattr(main_est, 'save_weights'):
+        if out_weights:
+            main_est.save_weights(out_weights)
+        del main_est.model_
+        del main_est.fit_params
+        del main_est.model_class_
+        del main_est.validation_data
+        if getattr(main_est, 'data_generator_', None):
+            del main_est.data_generator_
+
+    with open(out_object, 'wb') as output_handler:
+        pickle.dump(estimator, output_handler,
+                    pickle.HIGHEST_PROTOCOL)
+
+
+if __name__ == '__main__':
+    aparser = argparse.ArgumentParser()
+    aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
+    aparser.add_argument("-X", "--infile_estimator", dest="infile_estimator")
+    aparser.add_argument("-y", "--infile1", dest="infile1")
+    aparser.add_argument("-g", "--infile2", dest="infile2")
+    aparser.add_argument("-o", "--out_object", dest="out_object")
+    aparser.add_argument("-t", "--out_weights", dest="out_weights")
+    args = aparser.parse_args()
+
+    main(args.inputs, args.infile_estimator, args.infile1,
+         args.infile2, args.out_object, args.out_weights)
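
A matching sketch for the fitting script; again the call follows the `main` signature defined above, and the file names are placeholders:

    from simple_model_fit import main

    # Fits the pickled estimator skeleton on the tabular training data and
    # pickles the fitted model; out_weights only matters for Keras-style
    # models that keep a separate weights file (cf. test-data/model_fit02.h5).
    main('params.json', 'estimator_skeleton', 'train_features.tabular',
         'train_targets.tabular', 'model_fit01',
         out_weights='model_fit01_weights.h5')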
diff -r 50b3e080cef0 -r 66df2aa6cd6b stacking_ensembles.py
--- a/stacking_ensembles.py Wed Oct 02 03:55:06 2019 -0400
+++ b/stacking_ensembles.py Fri Nov 01 17:13:42 2019 -0400
@@ -82,7 +82,9 @@
 
     weights = options.pop('weights', None)
     if weights:
-        options['weights'] = ast.literal_eval(weights)
+        weights = ast.literal_eval(weights)
+        if weights:
+            options['weights'] = weights
 
     mod_and_name = estimator_type.split('_')
     mod = sys.modules[mod_and_name[0]]
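
The doubled check matters because Galaxy can pass the literal text of an empty list, and `ast.literal_eval('[]')` yields a falsy `[]` that should not be stored in the options. A standalone illustration of the patched logic (not code from the repository):

    import ast

    options = {'weights': '[]'}              # Galaxy passed an empty-list literal
    weights = options.pop('weights', None)
    if weights:                              # non-empty string, so parse it
        weights = ast.literal_eval(weights)  # '[]' -> [], which is falsy
        if weights:
            options['weights'] = weights
    print(options)                           # {} -- no spurious 'weights' key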
diff -r 50b3e080cef0 -r 66df2aa6cd6b test-data/fitted_model_eval01.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fitted_model_eval01.tabular Fri Nov 01 17:13:42 2019 -0400
@@ -0,0 +1,2 @@
+score
+0.8277511130733235
diff -r 50b3e080cef0 -r 66df2aa6cd6b test-data/model_fit01
Binary file test-data/model_fit01 has changed
diff -r 50b3e080cef0 -r 66df2aa6cd6b test-data/model_fit02
Binary file test-data/model_fit02 has changed
diff -r 50b3e080cef0 -r 66df2aa6cd6b test-data/model_fit02.h5
Binary file test-data/model_fit02.h5 has changed
diff -r 50b3e080cef0 -r 66df2aa6cd6b test-data/regression_y_split_test01.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/regression_y_split_test01.tabular Fri Nov 01 17:13:42 2019 -0400
@@ -0,0 +1,67 @@
+actual
+57
+71
+75
+49
+66
+59
+68
+48
+46
+45
+67
+75
+79
+74
+60
+48
+77
+71
+85
+41
+75
+61
+76
+52
+46
+77
+88
+60
+68
+40
+89
+46
+49
+68
+57
+50
+68
+55
+64
+51
+77
+79
+42
+76
+54
+54
+59
+80
+55
+54
+54
+54
+54
+71
+56
+66
+61
+40
+71
+63
+78
+53
+75
+50
+72
+68
diff -r 50b3e080cef0 -r 66df2aa6cd6b test-data/train_test_split_test01.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/train_test_split_test01.tabular Fri Nov 01 17:13:42 2019 -0400
@@ -0,0 +1,67 @@
+year month day temp_2 temp_1 average forecast_noaa forecast_acc forecast_under friend week_Fri week_Mon week_Sat week_Sun week_Thurs week_Tues week_Wed
+2016 11 2 59 57 54.2 54 58 55 70 0 0 0 0 0 0 1
+2016 11 8 61 63 52.7 49 57 52 49 0 0 0 0 0 1 0
+2016 7 13 74 77 75.6 74 78 76 56 0 0 0 0 0 0 1
+2016 3 14 52 54 53.4 49 58 55 44 0 1 0 0 0 0 0
+2016 6 13 65 70 69.3 66 72 69 79 0 1 0 0 0 0 0
+2016 5 21 63 66 65.7 62 67 65 49 0 0 1 0 0 0 0
+2016 7 4 76 71 73.8 71 76 73 86 0 1 0 0 0 0 0
+2016 1 15 55 49 47.1 46 51 46 65 1 0 0 0 0 0 0
+2016 2 1 48 47 48.8 46 49 49 51 0 1 0 0 0 0 0
+2016 1 11 50 52 46.7 42 48 48 39 0 1 0 0 0 0 0
+2016 6 8 86 85 68.5 67 70 69 81 0 0 0 0 0 0 1
+2016 7 23 81 71 77.0 75 81 76 86 0 0 1 0 0 0 0
+2016 9 14 74 75 71.2 67 75 73 77 0 0 0 0 0 0 1
+2016 9 12 77 70 71.8 67 73 73 90 0 1 0 0 0 0 0
+2016 10 17 62 60 59.1 57 63 59 62 0 1 0 0 0 0 0
+2016 1 19 50 54 47.6 47 49 48 53 0 0 0 0 0 1 0
+2016 9 26 67 76 67.2 64 69 69 74 0 1 0 0 0 0 0
+2016 9 15 75 79 71.0 66 76 69 64 0 0 0 0 1 0 0
+2016 7 28 79 83 77.3 76 80 78 76 0 0 0 0 1 0 0
+2016 12 24 45 40 45.1 44 47 46 39 0 0 1 0 0 0 0
+2016 6 1 71 79 67.4 65 69 66 58 0 0 0 0 0 0 1
+2016 10 3 63 65 64.5 63 68 65 49 0 1 0 0 0 0 0
+2016 4 8 68 77 57.1 57 61 57 41 1 0 0 0 0 0 0
+2016 11 17 55 50 50.5 46 51 50 57 0 0 0 0 1 0 0
+2016 12 4 50 49 46.8 45 47 47 53 0 0 0 1 0 0 0
+2016 9 10 72 74 72.3 70 77 74 91 0 0 1 0 0 0 0
+2016 7 29 83 85 77.3 77 80 79 77 1 0 0 0 0 0 0
+2016 10 14 66 60 60.2 56 64 60 78 1 0 0 0 0 0 0
+2016 3 30 56 64 55.7 51 57 56 57 0 0 0 0 0 0 1
+2016 12 5 49 46 46.6 43 50 45 65 0 1 0 0 0 0 0
+2016 4 18 68 77 58.8 55 59 57 39 0 1 0 0 0 0 0
+2016 12 19 35 39 45.1 42 46 45 51 0 1 0 0 0 0 0
+2016 2 4 51 49 49.0 44 54 51 44 0 0 0 0 1 0 0
+2016 4 30 64 61 61.4 60 65 62 78 0 0 1 0 0 0 0
+2016 4 5 69 60 56.6 52 58 56 72 0 0 0 0 0 1 0
+2016 11 16 57 55 50.7 50 51 49 34 0 0 0 0 0 0 1
+2016 9 28 77 69 66.5 66 68 66 62 0 0 0 0 0 0 1
+2016 1 13 45 49 46.9 45 51 46 33 0 0 0 0 0 0 1
+2016 3 5 59 57 52.1 49 53 51 46 0 0 1 0 0 0 0
+2016 1 24 57 48 48.1 46 50 48 54 0 0 0 1 0 0 0
+2016 7 14 77 75 75.8 74 76 77 77 0 0 0 0 1 0 0
+2016 8 23 84 81 75.7 73 78 77 89 0 0 0 0 0 1 0
+2016 12 25 40 41 45.1 42 49 44 31 0 0 0 1 0 0 0
+2016 9 25 64 67 67.6 64 72 67 62 0 0 0 1 0 0 0
+2016 11 21 57 55 49.5 46 51 49 67 0 1 0 0 0 0 0
+2016 1 16 49 48 47.3 45 52 46 28 0 0 1 0 0 0 0
+2016 2 24 51 60 50.8 47 53 50 46 0 0 0 0 0 0 1
+2016 8 4 73 75 77.3 73 79 78 66 0 0 0 0 1 0 0
+2016 3 2 54 58 51.6 47 54 52 37 0 0 0 0 0 0 1
+2016 1 25 48 51 48.2 45 51 49 63 0 1 0 0 0 0 0
+2016 1 18 54 50 47.5 44 48 49 58 0 1 0 0 0 0 0
+2016 11 22 55 54 49.3 46 54 49 58 0 0 0 0 0 1 0
+2016 3 13 55 52 53.3 50 55 53 54 0 0 0 1 0 0 0
+2016 5 17 57 60 65.0 62 65 65 55 0 0 0 0 0 1 0
+2016 1 28 56 57 48.4 44 52 48 34 0 0 0 0 1 0 0
+2016 5 24 66 65 66.2 66 71 66 67 0 0 0 0 0 1 0
+2016 11 6 65 58 53.2 52 57 55 71 0 0 0 1 0 0 0
+2016 12 23 49 45 45.1 45 49 44 35 1 0 0 0 0 0 0
+2016 6 25 68 69 71.7 68 73 73 89 0 0 1 0 0 0 0
+2016 4 2 73 71 56.2 55 58 58 45 0 0 1 0 0 0 0
+2016 6 26 69 71 71.9 67 74 72 70 0 0 0 1 0 0 0
+2016 11 26 52 52 48.4 48 50 47 58 0 0 1 0 0 0 0
+2016 9 13 70 74 71.5 71 75 70 82 0 0 0 0 0 1 0
+2016 12 2 52 46 47.2 46 51 49 41 1 0 0 0 0 0 0
+2016 8 6 80 79 77.2 76 81 79 60 0 0 1 0 0 0 0
+2016 10 29 60 65 55.3 55 59 55 65 0 0 1 0 0 0 0
diff -r 50b3e080cef0 -r 66df2aa6cd6b test-data/train_test_split_test02.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/train_test_split_test02.tabular Fri Nov 01 17:13:42 2019 -0400
@@ -0,0 +1,201 @@
[201 added rows of 20 tab-separated floating-point feature values; the hunk body was garbled into a raw byte string in this view]
diff -r 50b3e080cef0 -r 66df2aa6cd6b test-data/train_test_split_test03.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/train_test_split_test03.tabular Fri Nov 01 17:13:42 2019 -0400
@@ -0,0 +1,54 @@
+year month day temp_2 temp_1 average forecast_noaa forecast_acc forecast_under friend week_Fri week_Mon week_Sat week_Sun week_Thurs week_Tues week_Wed
+2016 9 19 68 69 69.7 65 74 71 88 0 1 0 0 0 0 0
+2016 1 25 48 51 48.2 45 51 49 63 0 1 0 0 0 0 0
+2016 12 17 39 35 45.2 43 47 46 38 0 0 1 0 0 0 0
+2016 7 17 76 72 76.3 76 78 77 88 0 0 0 1 0 0 0
+2016 6 27 71 78 72.2 70 74 72 84 0 1 0 0 0 0 0
+2016 4 17 60 68 58.6 58 62 59 54 0 0 0 1 0 0 0
+2016 11 2 59 57 54.2 54 58 55 70 0 0 0 0 0 0 1
+2016 12 27 42 42 45.2 41 50 47 47 0 0 0 0 0 1 0
+2016 1 16 49 48 47.3 45 52 46 28 0 0 1 0 0 0 0
+2016 12 7 40 42 46.3 44 51 46 62 0 0 0 0 0 0 1
+2016 8 28 81 79 75.0 71 77 76 85 0 0 0 1 0 0 0
+2016 10 19 60 61 58.4 58 60 57 41 0 0 0 0 0 0 1
+2016 5 5 74 60 62.5 58 66 62 56 0 0 0 0 1 0 0
+2016 12 11 36 44 45.7 41 46 47 35 0 0 0 1 0 0 0
+2016 3 30 56 64 55.7 51 57 56 57 0 0 0 0 0 0 1
+2016 10 9 64 68 62.1 58 65 63 55 0 0 0 1 0 0 0
+2016 1 12 52 45 46.8 44 50 45 61 0 0 0 0 0 1 0
+2016 8 13 80 87 76.8 73 79 78 73 0 0 1 0 0 0 0
+2016 9 23 68 67 68.3 67 69 67 61 1 0 0 0 0 0 0
+2016 6 16 60 67 69.8 68 72 71 87 0 0 0 0 1 0 0
+2016 9 8 68 67 72.8 69 77 73 56 0 0 0 0 1 0 0
+2016 12 4 50 49 46.8 45 47 47 53 0 0 0 1 0 0 0
+2016 1 13 45 49 46.9 45 51 46 33 0 0 0 0 0 0 1
+2016 2 5 49 49 49.1 47 50 49 45 1 0 0 0 0 0 0
+2016 6 22 76 73 71.0 66 71 72 78 0 0 0 0 0 0 1
+2016 5 25 65 66 66.4 65 67 66 60 0 0 0 0 0 0 1
+2016 4 8 68 77 57.1 57 61 57 41 1 0 0 0 0 0 0
+2016 10 11 57 60 61.4 58 66 61 58 0 0 0 0 0 1 0
+2016 11 4 57 65 53.7 49 55 54 38 1 0 0 0 0 0 0
+2016 11 30 52 52 47.6 47 52 49 44 0 0 0 0 0 0 1
+2016 8 4 73 75 77.3 73 79 78 66 0 0 0 0 1 0 0
+2016 9 20 69 71 69.4 67 73 69 81 0 0 0 0 0 1 0
+2016 2 19 57 53 50.2 50 52 51 42 1 0 0 0 0 0 0
+2016 9 4 70 67 73.7 72 77 75 64 0 0 0 1 0 0 0
+2016 10 4 65 61 64.1 62 69 65 60 0 0 0 0 0 1 0
+2016 5 21 63 66 65.7 62 67 65 49 0 0 1 0 0 0 0
+2016 1 9 45 48 46.4 46 50 45 47 0 0 1 0 0 0 0
+2016 8 3 77 73 77.3 77 81 77 93 0 0 0 0 0 0 1
+2016 10 7 66 63 62.9 62 67 64 78 1 0 0 0 0 0 0
+2016 10 17 62 60 59.1 57 63 59 62 0 1 0 0 0 0 0
+2016 6 18 71 67 70.2 67 75 69 77 0 0 1 0 0 0 0
+2016 12 26 41 42 45.2 45 48 46 58 0 1 0 0 0 0 0
+2016 11 20 55 57 49.8 47 54 48 30 0 0 0 1 0 0 0
+2016 2 22 53 51 50.6 46 51 50 59 0 1 0 0 0 0 0
+2016 6 26 69 71 71.9 67 74 72 70 0 0 0 1 0 0 0
+2016 7 11 71 74 75.3 74 79 75 71 0 1 0 0 0 0 0
+2016 6 21 70 76 70.8 68 75 71 57 0 0 0 0 0 1 0
+2016 3 2 54 58 51.6 47 54 52 37 0 0 0 0 0 0 1
+2016 6 12 67 65 69.1 65 73 70 83 0 0 0 1 0 0 0
+2016 5 13 81 77 64.3 63 67 66 67 1 0 0 0 0 0 0
+2016 4 12 59 58 57.7 54 59 57 61 0 0 0 0 0 1 0
+2016 10 14 66 60 60.2 56 64 60 78 1 0 0 0 0 0 0
+2016 4 15 59 59 58.3 58 61 60 40 1 0 0 0 0 0 0
diff -r 50b3e080cef0 -r 66df2aa6cd6b test-data/train_test_split_train01.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/train_test_split_train01.tabular Fri Nov 01 17:13:42 2019 -0400
@@ -0,0 +1,196 @@
+year month day temp_2 temp_1 average forecast_noaa forecast_acc forecast_under friend week_Fri week_Mon week_Sat week_Sun week_Thurs week_Tues week_Wed
[195 tab-separated data rows; the rest of the hunk was garbled into a raw byte string in this view]
diff -r 50b3e080cef0 -r 66df2aa6cd6b test-data/train_test_split_train02.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/train_test_split_train02.tabular Fri Nov 01 17:13:42 2019 -0400
@@ -0,0 +1,800 @@
[800 added rows of 20 tab-separated floating-point feature values; the hunk body was garbled into a raw byte string in this view]
diff -r 50b3e080cef0 -r 66df2aa6cd6b test-data/train_test_split_train03.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/train_test_split_train03.tabular Fri Nov 01 17:13:42 2019 -0400
@@ -0,0 +1,209 @@
+year month day temp_2 temp_1 average forecast_noaa forecast_acc forecast_under friend week_Fri week_Mon week_Sat week_Sun week_Thurs week_Tues week_Wed
[208 tab-separated data rows; the rest of the hunk was garbled into a raw byte string in this view]
diff -r 50b3e080cef0 -r 66df2aa6cd6b train_test_split.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/train_test_split.py Fri Nov 01 17:13:42 2019 -0400
[
@@ -0,0 +1,154 @@
+import argparse
+import json
+import pandas as pd
+import warnings
+
+from galaxy_ml.model_validations import train_test_split
+from galaxy_ml.utils import get_cv, read_columns
+
+
+def _get_single_cv_split(params, array, infile_labels=None,
+                         infile_groups=None):
+    """ output (train, test) subset from a cv splitter
+
+    Parameters
+    ----------
+    params : dict
+        Galaxy tool inputs
+    array : pandas DataFrame object
+        The target dataset to split
+    infile_labels : str
+        File path to dataset containing target values
+    infile_groups : str
+        File path to dataset containing group values
+    """
+    y = None
+    groups = None
+
+    nth_split = params['mode_selection']['nth_split']
+
+    # read groups
+    if infile_groups:
+        header = 'infer' if (params['mode_selection']['cv_selector']
+                             ['groups_selector']['header_g']) else None
+        column_option = (params['mode_selection']['cv_selector']
+                         ['groups_selector']['column_selector_options_g']
+                         ['selected_column_selector_option_g'])
+        if column_option in ['by_index_number', 'all_but_by_index_number',
+                             'by_header_name', 'all_but_by_header_name']:
+            c = (params['mode_selection']['cv_selector']['groups_selector']
+                 ['column_selector_options_g']['col_g'])
+        else:
+            c = None
+
+        groups = read_columns(infile_groups, c=c, c_option=column_option,
+                              sep='\t', header=header, parse_dates=True)
+        groups = groups.ravel()
+
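+        # replace the groups_selector sub-dict with the raw group labels,
+        # so that get_cv() below can consume them directly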
+        params['mode_selection']['cv_selector']['groups_selector'] = groups
+
+    # read labels
+    if infile_labels:
+        target_input = (params['mode_selection']
+                        ['cv_selector'].pop('target_input'))
+        header = 'infer' if target_input['header1'] else None
+        col_index = target_input['col'][0] - 1
+        df = pd.read_csv(infile_labels, sep='\t', header=header,
+                         parse_dates=True)
+        y = df.iloc[:, col_index].values
+
+    # construct the cv splitter object
+    splitter, groups = get_cv(params['mode_selection']['cv_selector'])
+
+    total_n_splits = splitter.get_n_splits(array.values, y=y, groups=groups)
+    if not 1 <= nth_split <= total_n_splits:
+        raise ValueError("Total number of splits is {}, but got `nth_split` "
+                         "= {}".format(total_n_splits, nth_split))
+
+    # advance the splitter to the requested split; `nth_split` is 1-based
+    for i, (train_index, test_index) in enumerate(
+            splitter.split(array.values, y=y, groups=groups), start=1):
+        if i == nth_split:
+            break
+
+    train = array.iloc[train_index, :]
+    test = array.iloc[test_index, :]
+
+    return train, test
+
+
+def main(inputs, infile_array, outfile_train, outfile_test,
+         infile_labels=None, infile_groups=None):
+    """
+    Parameters
+    ----------
+    inputs : str
+        File path to Galaxy tool parameters (JSON)
+
+    infile_array : str
+        File path to the tabular dataset to be split
+
+    infile_labels : str
+        File path to dataset containing labels
+
+    infile_groups : str
+        File path to dataset containing groups
+
+    outfile_train : str
+        File path to dataset containing train split
+
+    outfile_test : str
+        File path to dataset containing test split
+    """
+    warnings.simplefilter('ignore')
+
+    with open(inputs, 'r') as param_handler:
+        params = json.load(param_handler)
+
+    input_header = params['header0']
+    header = 'infer' if input_header else None
+    array = pd.read_csv(infile_array, sep='\t', header=header,
+                        parse_dates=True)
+
+    # train test split
+    if params['mode_selection']['selected_mode'] == 'train_test_split':
+        options = params['mode_selection']['options']
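+        # flatten the shuffle sub-dict into plain keyword arguments for
+        # galaxy_ml's train_test_split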
+        shuffle_selection = options.pop('shuffle_selection')
+        options['shuffle'] = shuffle_selection['shuffle']
+        if options['shuffle'] == 'None':
+            # guard against the literal string 'None' coming from the form
+            options['shuffle'] = None
+        if infile_labels:
+            header = 'infer' if shuffle_selection['header1'] else None
+            col_index = shuffle_selection['col'][0] - 1
+            df = pd.read_csv(infile_labels, sep='\t', header=header,
+                             parse_dates=True)
+            labels = df.iloc[:, col_index].values
+            options['labels'] = labels
+
+        train, test = train_test_split(array, **options)
+
+    # cv splitter
+    else:
+        train, test = _get_single_cv_split(params, array,
+                                           infile_labels=infile_labels,
+                                           infile_groups=infile_groups)
+
+    print("Input shape: %s" % repr(array.shape))
+    print("Train shape: %s" % repr(train.shape))
+    print("Test shape: %s" % repr(test.shape))
+    train.to_csv(outfile_train, sep='\t', header=input_header, index=False)
+    test.to_csv(outfile_test, sep='\t', header=input_header, index=False)
+
+
+if __name__ == '__main__':
+    aparser = argparse.ArgumentParser()
+    aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
+    aparser.add_argument("-X", "--infile_array", dest="infile_array")
+    aparser.add_argument("-y", "--infile_labels", dest="infile_labels")
+    aparser.add_argument("-g", "--infile_groups", dest="infile_groups")
+    aparser.add_argument("-o", "--outfile_train", dest="outfile_train")
+    aparser.add_argument("-t", "--outfile_test", dest="outfile_test")
+    args = aparser.parse_args()
+
+    main(args.inputs, args.infile_array, args.outfile_train,
+         args.outfile_test, args.infile_labels, args.infile_groups)
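
For reference, a minimal sketch of the nth-split pattern used in
_get_single_cv_split, with scikit-learn's KFold standing in for whatever
splitter get_cv() builds from the tool parameters; the toy DataFrame and the
nth_split value are illustrative only:

import pandas as pd
from sklearn.model_selection import KFold

array = pd.DataFrame({'x': range(10), 'y': range(10, 20)})
nth_split = 3  # 1-based, matching the tool's convention

splitter = KFold(n_splits=5, shuffle=True, random_state=0)
total_n_splits = splitter.get_n_splits(array.values)
if not 1 <= nth_split <= total_n_splits:
    raise ValueError("nth_split out of range")

# enumerate(..., start=1) keeps the 1-based indexing of nth_split
for i, (train_index, test_index) in enumerate(
        splitter.split(array.values), start=1):
    if i == nth_split:
        break

train = array.iloc[train_index, :]
test = array.iloc[test_index, :]
print(train.shape, test.shape)  # (8, 2) (2, 2)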
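The groups handling exists for group-aware splitters, which must keep every
row sharing a group id on the same side of the split. A sketch, again with an
assumed scikit-learn splitter (GroupKFold) and made-up arrays:

import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold

array = pd.DataFrame({'x': range(8)})
groups = np.array([1, 1, 2, 2, 3, 3, 4, 4])  # one id per row, as after ravel()

splitter = GroupKFold(n_splits=4)
train_index, test_index = next(splitter.split(array.values, groups=groups))

# every test row comes from a single group; no group id is split
# across train and test
print(groups[test_index])
print(groups[train_index])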
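Lastly, a hedged end-to-end sketch of driving the script outside Galaxy. The
parameter keys mirror what the train/test-split branch of main() reads above,
but the exact JSON the Galaxy wrapper emits may differ, and test_size /
random_state assume galaxy_ml's train_test_split forwards sklearn-style
keyword arguments; all file names are hypothetical:

import json
import subprocess

# illustrative parameters for the plain train/test-split branch
params = {
    "header0": True,
    "mode_selection": {
        "selected_mode": "train_test_split",
        "options": {
            "test_size": 0.25,        # assumed sklearn-style kwarg
            "random_state": 42,       # assumed sklearn-style kwarg
            "shuffle_selection": {"shuffle": "simple"},
        },
    },
}

with open("inputs.json", "w") as f:
    json.dump(params, f)

# flags taken from the argparse block above
subprocess.run(
    ["python", "train_test_split.py",
     "-i", "inputs.json",
     "-X", "data.tabular",     # tab-separated feature table
     "-o", "train.tabular",
     "-t", "test.tabular"],
    check=True,
)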