changeset 4:a580e700aac3 draft

Uploaded
author triasteran
date Tue, 21 Jun 2022 08:32:44 +0000
parents d27375bc4a1c
children e370df93715d
files UMI_riboseq_processing/UMI.py UMI_riboseq_processing/UMI_riboseq.xml
diffstat 2 files changed, 26 insertions(+), 26 deletions(-) [+]
line wrap: on
line diff
--- a/UMI_riboseq_processing/UMI.py	Mon Jun 20 08:02:35 2022 +0000
+++ b/UMI_riboseq_processing/UMI.py	Tue Jun 21 08:32:44 2022 +0000
@@ -1,48 +1,46 @@
-import itertools
+import gzip
+from mimetypes import guess_type
+from functools import partial
+from Bio import SeqIO
 from sys import argv, exit
-from itertools import zip_longest
-
-def grouper(iterable, n, fillvalue=None):
-    args = [iter(iterable)] * n
-    return zip_longest(*args, fillvalue=fillvalue)
-
 
-chunk_size=4
-
-
-def trimandpaste(pathToFastaFile, output):
-    #filename = pathToFastaFile.split('/')[-1]
-    output = open(output,"w")
-    with open(pathToFastaFile) as f:
-        for lines in grouper(f, chunk_size, ""): #for every chunk_sized chunk
+def copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output):
+    # find wheather its plain or gzipped fastq
+    encoding = guess_type(pathToFastaFile)[1]  # uses file extension
+    _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open
+    # output file will be in gz format
+    output = gzip.open(output,"wt")
+    # open and parse
+    with _open(pathToFastaFile) as f:
+        for record in SeqIO.parse(f, 'fastq'):
+            lines = record.format('fastq').split('\n') # list of each record: id, seq, '+', quality
             header = lines[0]
             seq = lines[1]
             sep = lines[2]
             qual = lines[3]
             trimmed_seq = seq[2:-6]+"\n" # fooprint + barcode
-            UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN  
+            UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN
             split_header = header.split(" ")
             new_header = split_header[0]+"_"+UMI+" "+split_header[1]
             if qual[-1:] == "\n":
                 new_qual = qual[2:-6]+"\n"
             else:
                 new_qual = qual[2:-6]
-            output.write(new_header)
-            output.write(trimmed_seq) 
-            output.write(sep) 
-            output.write(new_qual)
+            output.write(new_header+'\n')
+            output.write(trimmed_seq)
+            output.write(sep+'\n')
+            output.write(new_qual+'\n')
 
-    output.close() 
+    output.close()
 
 def main():
-    if len(argv) != 3: 
+    if len(argv) != 3:
         exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file")
 
     # Get paths
     pathToFastaFile = argv[1]
     output = argv[2]
-        
-    trimandpaste(pathToFastaFile, output)
+    copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output)
 
 if __name__ == "__main__":
     main()
--- a/UMI_riboseq_processing/UMI_riboseq.xml	Mon Jun 20 08:02:35 2022 +0000
+++ b/UMI_riboseq_processing/UMI_riboseq.xml	Tue Jun 21 08:32:44 2022 +0000
@@ -1,5 +1,6 @@
-<tool id="UMI_riboseq" name="move UMIs from reads to header" version="0.1.3">
+<tool id="UMI_riboseq" name="move UMIs from reads to header" version="0.1.4">
 <requirements>
+  <requirement type="package" version="1.75">biopython</requirement>
 </requirements>
 <command detect_errors="exit_code">
 <![CDATA[ python3 '$__tool_directory__/UMI.py' $reads $output ]]>
@@ -17,7 +18,8 @@
     </test>
 </tests>
 <help>
-<![CDATA[ **fastq/fastq.gz** input files with reads containing both UMIs and barcodes but NO adapters. ----- **Output** The output of the script is fastq/fastq.gz file where UMIs (7nt in total, 5'NN and 3'NNNNN preceding barcode consisting of 5nt) are in header of the read; next step of processing is demultiplexing]]>
+<![CDATA[ fastq/fastq.gz are input files with reads containing UMIs (already demultiplexed and adapters are removed).
+          The output of the script is fastq/fastq.gz file where UMIs (7nt in total, 5'NN and 3'NNNNN preceding barcode consisting of 5nt) are in header of the read. It is designed for protocol: McGlincy NJ, Ingolia NT. Transcriptome-wide measurement of translation by ribosome profiling. Methods. 2017;126:112-129. doi:10.1016/j.ymeth.2017.05.028 ]]>
 </help>
 <citations>
 <citation type="bibtex"> @misc{FedorovaAD2022, author = {Fedorova Alla}, year = {2022}, title = {UMI_for_riboseq}, publisher = {GitHub}, journal = {GitHub repository}, url = {https://github.com/triasteran/RiboGalaxy_with_ansible/blob/main/toolshed_tools/UMI_riboseq_tool/UMI.py}, }