Mercurial > repos > nml > biohansel_bionumeric_converter
changeset 1:07dfb8fd47f4 draft default tip
planemo upload commit e0d65bf0850ce95ffb89982e61f2136fcf0359ee
| author | nml | 
|---|---|
| date | Mon, 13 May 2019 12:59:15 -0400 | 
| parents | b000a3130db8 | 
| children | |
| files | bionumeric_convert.xml bionumeric_converter.py test-data/Biohansel_Bionumerics.csv test-data/Output.csv test-data/results.tab | 
| diffstat | 5 files changed, 37 insertions(+), 33 deletions(-) [+] | 
line wrap: on
 line diff
--- a/bionumeric_convert.xml Mon Mar 18 13:15:57 2019 -0400
+++ b/bionumeric_convert.xml Mon May 13 12:59:15 2019 -0400
@@ -1,38 +1,41 @@
-<tool id="bionumeric_convert" name="biohansel2bionumerics" version="0.1.0">
+<tool id="bionumeric_convert" name="biohansel2bionumerics" version="0.2.0">
     <description>compliant results</description>
     <requirements>
         <requirement type="package" version="0.24.1">pandas</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
-        $__tool_directory__/bionumeric_converter.py -f '$Input' -o '$output'
+        $__tool_directory__/bionumeric_converter.py -f '$Input' -o '$Output'
     ]]></command>
     <inputs>
         <param type="data" name="Input" format="tabular"/>
     </inputs>
     <outputs>
-        <data name="output" format="csv" from_work_dir="output" label="Output.csv"/>
+        <data name="Output" format="csv" from_work_dir="output" label="Biohansel_Bionumerics"/>
     </outputs>
     <tests>
         <test>
             <param name="Input" value="results.tab"/>
-            <output name="output" value="Output.csv"/>
+            <output name="Output" value="Biohansel_Bionumerics.csv"/>
         </test>
     </tests>
     <help><![CDATA[
         **What it does**

-        This tool is a supplementary script that takes *only* BioHansel output data and converts it into a format compatible with bionumerics.
+        This tool is a supplementary script that takes Biohansel output data and converts it into a format compatible with Bionumerics.

-        **How to run it**
+        **Inputs:**
+
+        - *Individual* output or *Collection* of outputs for any of the three Biohansel results files (tech_results.tab, match_results.tab, or results.tab)

-        1. Input any of your BioHansel output files (tech_results.tab, match_results.tab, and results.tab)
-        2. Click Execute
+        **Outputs:**

-        **Specific modifications done on the data**
+        - A .CSV file or a collection of .CSV files called "*Output*" that can be renamed and downloaded as required.
+
+        **Specific modifications done to the data**

         1. Converts all commas in the output to "/"
-        2. Shortens BioHansel qc_messages if they are over 150 characters
-        3. Converts the .tab file to a .csv file
+        2. Splits Biohansel qc_message column into multiple columns if the message is longer than 150 characters
+        3. Converts the .tab or .tsv file to a .csv file

     ]]></help>
     <citations>
--- a/bionumeric_converter.py Mon Mar 18 13:15:57 2019 -0400
+++ b/bionumeric_converter.py Mon May 13 12:59:15 2019 -0400
@@ -14,7 +14,7 @@
         '-f',
         '--filename',
         required=True,
-        help='Specify your tsv input')
+        help='Specify your biohansel tsv or other tabular separated input')
     parser.add_argument(
         '-o',
         '--output',
@@ -24,30 +24,27 @@
     tsv_file = args.filename
     out_name = args.output

-    no_comma_tsv = comma_remover(tsv_file)
-    df = qc_shortener(no_comma_tsv)
-    df.to_csv(out_name, index=False)
-
-# Remove comma function:
-
+    df_input = pd.read_csv(tsv_file, sep='\t')

-def comma_remover(tsv_file):
-    # Create a table from the tsv file as an input into the dataframe.
-    df = pd.read_csv(tsv_file, sep='\t')
-    # Change all commas to / in the QC message
-    no_comma_tsv = df.replace(',', '/', regex=True)
-    return no_comma_tsv
+    df_no_comma = df_input.replace(',', '/', regex=True)
+    df = qc_shortener(df_no_comma)
+    df.to_csv(out_name, index=False)

 # Shorten QC results:


+def splittingstrings(string, length):
+    return (string[0+i:length+i] for i in range(0, len(string), length))
+
+
 def qc_shortener(df):
-    for count in df.index:
-        message = str(df.at[count, 'qc_message'])
+    for i, row in df.iterrows():
+        message = str(row['qc_message'])
         if len(message) > 150:
-            results = message.find('|')
-            new_message = "Truncated after first '|' : " + message[0:results]
-            df['qc_message'] = df['qc_message'].replace(message, new_message)
+            message_list = list(splittingstrings(message, 150))
+            df.at[i, 'qc_message'] = message_list[0]
+            for val in range(1, len(message_list)):
+                df.at[i, 'qc_message_{}'.format(val)] = message_list[val]
     return df


--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Biohansel_Bionumerics.csv Mon May 13 12:59:15 2019 -0400
@@ -0,0 +1,4 @@
+sample,subtype,avg_tile_coverage,qc_status,qc_message,qc_message_1
+SRR1645238,1.3,43.345,PASS,,
+SRR1753252,1.1,32.33,PASS,FAIL: This is a test of the cut off system. The data is good and as such I have to manually type this message in to get it to cut off. I am adding in ,5 comas /////
+SRR1928313,1.1.1,555.11,PASS,,
--- a/test-data/Output.csv Mon Mar 18 13:15:57 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-sample,scheme,scheme_version,subtype,all_subtypes,tiles_matching_subtype,are_subtypes_consistent,inconsistent_subtypes,n_tiles_matching_all,n_tiles_matching_all_expected,n_tiles_matching_positive,n_tiles_matching_positive_expected,n_tiles_matching_subtype,n_tiles_matching_subtype_expected,file_path,avg_tile_coverage,qc_status,qc_message
-2019C-111,heidelberg,0.5.0,2.2.3.1.2,2; 2.2; 2.2.3; 2.2.3.1; 2.2.3.1.2,2.2.3.1.2,True,,202,202,14,14,3,3,['2019C-111_1.fastq'/ '2019C-111_2.fastq'],30.07,PASS,Truncated after first '|' : This is a trial to the cut /off/ system as this data all passed the checks.
--- a/test-data/results.tab Mon Mar 18 13:15:57 2019 -0400
+++ b/test-data/results.tab Mon May 13 12:59:15 2019 -0400
@@ -1,2 +1,4 @@
-sample scheme scheme_version subtype all_subtypes tiles_matching_subtype are_subtypes_consistent inconsistent_subtypes n_tiles_matching_all n_tiles_matching_all_expected n_tiles_matching_positive n_tiles_matching_positive_expected n_tiles_matching_subtype n_tiles_matching_subtype_expected file_path avg_tile_coverage qc_status qc_message
-2019C-111 heidelberg 0.5.0 2.2.3.1.2 2; 2.2; 2.2.3; 2.2.3.1; 2.2.3.1.2 2.2.3.1.2 True 202 202 14 14 3 3 ['2019C-111_1.fastq', '2019C-111_2.fastq'] 30.070 PASS This is a trial to the cut ,off, system as this data all passed the checks. | I will attemp to get 150 characters into here in a way that is not awful and sounds decent. We can try counting the letters and as of now, it should be ok!
+sample subtype avg_tile_coverage qc_status qc_message
+SRR1645238 1.3 43.345 PASS
+SRR1753252 1.1 32.33 PASS "FAIL: This is a test of the cut off system. The data is good and as such I have to manually type this message in to get it to cut off. I am adding in 5 comas ,,,,,"
+SRR1928313 1.1.1 555.11 PASS
