Mercurial > repos > bgruening > sklearn_model_validation
annotate train_test_split.py @ 40:e06ab1e112cc draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 57a0433defa3cbc37ab34fbb0ebcfaeb680db8d5
author | bgruening |
---|---|
date | Sun, 05 Nov 2023 15:56:52 +0000 |
parents | 1fe00785190d |
children |
rev | line source |
---|---|
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
1 import argparse |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
2 import json |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
3 import warnings |
34
1fe00785190d
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
bgruening
parents:
30
diff
changeset
|
4 from distutils.version import LooseVersion as Version |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
5 |
29
de360b57a5ab
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 208a8d348e7c7a182cfbe1b6f17868146428a7e2"
bgruening
parents:
28
diff
changeset
|
6 import pandas as pd |
34
1fe00785190d
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
bgruening
parents:
30
diff
changeset
|
7 from galaxy_ml import __version__ as galaxy_ml_version |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
8 from galaxy_ml.model_validations import train_test_split |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
9 from galaxy_ml.utils import get_cv, read_columns |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
10 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
11 |
28
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
12 def _get_single_cv_split(params, array, infile_labels=None, infile_groups=None): |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
13 """output (train, test) subset from a cv splitter |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
14 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
15 Parameters |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
16 ---------- |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
17 params : dict |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
18 Galaxy tool inputs |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
19 array : pandas DataFrame object |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
20 The target dataset to split |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
21 infile_labels : str |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
22 File path to dataset containing target values |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
23 infile_groups : str |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
24 File path to dataset containing group values |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
25 """ |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
26 y = None |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
27 groups = None |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
28 |
28
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
29 nth_split = params["mode_selection"]["nth_split"] |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
30 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
31 # read groups |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
32 if infile_groups: |
30
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
33 header = ( |
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
34 "infer" |
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
35 if (params["mode_selection"]["cv_selector"]["groups_selector"]["header_g"]) |
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
36 else None |
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
37 ) |
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
38 column_option = params["mode_selection"]["cv_selector"]["groups_selector"][ |
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
39 "column_selector_options_g" |
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
40 ]["selected_column_selector_option_g"] |
28
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
41 if column_option in [ |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
42 "by_index_number", |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
43 "all_but_by_index_number", |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
44 "by_header_name", |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
45 "all_but_by_header_name", |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
46 ]: |
30
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
47 c = params["mode_selection"]["cv_selector"]["groups_selector"][ |
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
48 "column_selector_options_g" |
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
49 ]["col_g"] |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
50 else: |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
51 c = None |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
52 |
28
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
53 groups = read_columns( |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
54 infile_groups, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
55 c=c, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
56 c_option=column_option, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
57 sep="\t", |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
58 header=header, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
59 parse_dates=True, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
60 ) |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
61 groups = groups.ravel() |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
62 |
28
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
63 params["mode_selection"]["cv_selector"]["groups_selector"] = groups |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
64 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
65 # read labels |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
66 if infile_labels: |
28
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
67 target_input = params["mode_selection"]["cv_selector"].pop("target_input") |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
68 header = "infer" if target_input["header1"] else None |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
69 col_index = target_input["col"][0] - 1 |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
70 df = pd.read_csv(infile_labels, sep="\t", header=header, parse_dates=True) |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
71 y = df.iloc[:, col_index].values |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
72 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
73 # construct the cv splitter object |
34
1fe00785190d
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
bgruening
parents:
30
diff
changeset
|
74 cv_selector = params["mode_selection"]["cv_selector"] |
1fe00785190d
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
bgruening
parents:
30
diff
changeset
|
75 if Version(galaxy_ml_version) < Version("0.8.3"): |
1fe00785190d
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
bgruening
parents:
30
diff
changeset
|
76 cv_selector.pop("n_stratification_bins", None) |
1fe00785190d
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
bgruening
parents:
30
diff
changeset
|
77 splitter, groups = get_cv(cv_selector) |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
78 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
79 total_n_splits = splitter.get_n_splits(array.values, y=y, groups=groups) |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
80 if nth_split > total_n_splits: |
30
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
81 raise ValueError( |
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
82 "Total number of splits is {}, but got `nth_split` " |
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
83 "= {}".format(total_n_splits, nth_split) |
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
84 ) |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
85 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
86 i = 1 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
87 for train_index, test_index in splitter.split(array.values, y=y, groups=groups): |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
88 # suppose nth_split >= 1 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
89 if i == nth_split: |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
90 break |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
91 else: |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
92 i += 1 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
93 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
94 train = array.iloc[train_index, :] |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
95 test = array.iloc[test_index, :] |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
96 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
97 return train, test |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
98 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
99 |
28
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
100 def main( |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
101 inputs, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
102 infile_array, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
103 outfile_train, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
104 outfile_test, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
105 infile_labels=None, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
106 infile_groups=None, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
107 ): |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
108 """ |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
109 Parameter |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
110 --------- |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
111 inputs : str |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
112 File path to galaxy tool parameter |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
113 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
114 infile_array : str |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
115 File paths of input arrays separated by comma |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
116 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
117 infile_labels : str |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
118 File path to dataset containing labels |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
119 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
120 infile_groups : str |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
121 File path to dataset containing groups |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
122 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
123 outfile_train : str |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
124 File path to dataset containing train split |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
125 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
126 outfile_test : str |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
127 File path to dataset containing test split |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
128 """ |
28
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
129 warnings.simplefilter("ignore") |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
130 |
28
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
131 with open(inputs, "r") as param_handler: |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
132 params = json.load(param_handler) |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
133 |
28
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
134 input_header = params["header0"] |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
135 header = "infer" if input_header else None |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
136 array = pd.read_csv(infile_array, sep="\t", header=header, parse_dates=True) |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
137 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
138 # train test split |
28
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
139 if params["mode_selection"]["selected_mode"] == "train_test_split": |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
140 options = params["mode_selection"]["options"] |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
141 shuffle_selection = options.pop("shuffle_selection") |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
142 options["shuffle"] = shuffle_selection["shuffle"] |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
143 if infile_labels: |
28
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
144 header = "infer" if shuffle_selection["header1"] else None |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
145 col_index = shuffle_selection["col"][0] - 1 |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
146 df = pd.read_csv(infile_labels, sep="\t", header=header, parse_dates=True) |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
147 labels = df.iloc[:, col_index].values |
28
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
148 options["labels"] = labels |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
149 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
150 train, test = train_test_split(array, **options) |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
151 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
152 # cv splitter |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
153 else: |
30
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
154 train, test = _get_single_cv_split( |
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
155 params, array, infile_labels=infile_labels, infile_groups=infile_groups |
4b359039f09f
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
29
diff
changeset
|
156 ) |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
157 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
158 print("Input shape: %s" % repr(array.shape)) |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
159 print("Train shape: %s" % repr(train.shape)) |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
160 print("Test shape: %s" % repr(test.shape)) |
28
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
161 train.to_csv(outfile_train, sep="\t", header=input_header, index=False) |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
162 test.to_csv(outfile_test, sep="\t", header=input_header, index=False) |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
163 |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
164 |
28
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
165 if __name__ == "__main__": |
22
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
166 aparser = argparse.ArgumentParser() |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
167 aparser.add_argument("-i", "--inputs", dest="inputs", required=True) |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
168 aparser.add_argument("-X", "--infile_array", dest="infile_array") |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
169 aparser.add_argument("-y", "--infile_labels", dest="infile_labels") |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
170 aparser.add_argument("-g", "--infile_groups", dest="infile_groups") |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
171 aparser.add_argument("-o", "--outfile_train", dest="outfile_train") |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
172 aparser.add_argument("-t", "--outfile_test", dest="outfile_test") |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
173 args = aparser.parse_args() |
95b04e086576
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
174 |
28
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
175 main( |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
176 args.inputs, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
177 args.infile_array, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
178 args.outfile_train, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
179 args.outfile_test, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
180 args.infile_labels, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
181 args.infile_groups, |
9b017b0da56e
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
22
diff
changeset
|
182 ) |