diff UMI_riboseq_processing/UMI.py @ 0:ef98c6fad2a2 draft

Uploaded
author triasteran
date Sun, 19 Jun 2022 11:29:41 +0000
parents
children 5d0d5933d370
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/UMI_riboseq_processing/UMI.py	Sun Jun 19 11:29:41 2022 +0000
@@ -0,0 +1,48 @@
+import itertools
+from sys import argv, exit
+from itertools import zip_longest
+
+def grouper(iterable, n, fillvalue=None):
+    """Chunk iterable into n-tuples (one shared iterator, repeated n times); pad with fillvalue."""
+    return zip_longest(*([iter(iterable)] * n), fillvalue=fillvalue)
+
+
+chunk_size=4  # input lines per record: header, seq, sep, qual (see trimandpaste)
+
+
+def trimandpaste(pathToFastaFile, output):
+    """Move the 7-nt UMI of every 4-line FASTQ record into the read header.
+
+    UMI = first 2 nt + the 5 nt preceding the final 5-nt barcode. Footprint +
+    barcode, with identically sliced qualities, are written to path `output`.
+    Raises ValueError if the input ends in the middle of a record.
+    """
+    with open(pathToFastaFile) as f, open(output, "w") as out:  # closed on error too
+        for lines in grouper(f, chunk_size, ""):  # for every chunk_sized chunk
+            if not any(line.strip() for line in lines):
+                continue  # blank line(s) at end of file, not a record
+            if "" in lines:  # grouper's fill value: the file ended mid-record
+                raise ValueError("incomplete record: " + lines[0].strip())
+            # Slice newline-free text so a last line lacking "\n" is cut correctly.
+            header, seq, sep, qual = (line.rstrip("\n") for line in lines)
+            UMI = seq[:2] + seq[-10:-5]  # 7nt in total
+            # partition() keeps the full description and tolerates headers with no space
+            name, space, rest = header.partition(" ")
+            new_header = name + "_" + UMI + space + rest
+            out.write(new_header + "\n")
+            out.write(seq[2:-10] + seq[-5:] + "\n")  # footprint + barcode
+            out.write(sep + "\n")
+            out.write(qual[2:-10] + qual[-5:] + "\n")
+
+def main():
+    """Command-line entry point: check the argument count, then run trimandpaste.
+
+    Exits with a usage message unless exactly two arguments follow the script name.
+    """
+    if len(argv) != 3:
+        exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file")
+    in_path, out_path = argv[1], argv[2]
+    trimandpaste(in_path, out_path)
+
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+    main()