annotate extract_archive_and_merge.py @ 0:78d968479d52 draft default tip

Imported from capsule None
author cmonjeau
date Tue, 07 Jul 2015 08:49:40 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
1 #!/usr/bin/env python
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
2 """
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
3
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
4
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
5 Created by Cyril MONJEAUD
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
6 Cyril.Monjeaud@irisa.fr
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
7 Last modification: 11/19/2014
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
8
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
9 And with the help of Anthony Bretaudeau for some stuff with bz2.
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
10
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
11 """
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
12
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
13 import argparse, os, sys, subprocess, tempfile, shutil, gzip, zipfile, tarfile, gzip, bz2, shutil
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
14 import glob
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
15 from galaxy import eggs
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
16 from galaxy import util
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
17 from galaxy.datatypes.checkers import *
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
18
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
def stop_err( msg ):
    """Report a fatal error on stderr and terminate the script.

    msg -- the message to print; a newline is appended.

    Raises SystemExit with exit status 1.
    """
    # Wrap msg in a 1-tuple so a tuple-valued message is printed as-is
    # instead of being unpacked as several format arguments.
    sys.stderr.write( '%s\n' % (msg,) )
    # A bare sys.exit() ends the process with status 0, i.e. "success";
    # an error path has to report a failing status to the caller.
    sys.exit( 1 )
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
22
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
def main(archive, archivename, logfile, logid, workdir, merge, rm_header=0, concat=''):
    """Extract an archive into ``decompress_files`` and optionally merge its files.

    All arguments arrive as strings when the script is run from the command
    line (they are forwarded from ``sys.argv``).

    archive     -- path of the zip, tar, gzip or bz2 file to extract
    archivename -- original name of the archive; for a gzip/bz2 input its
                   basename without the last extension names the
                   decompressed file
    logfile     -- path of the log file written by this function
    logid       -- unused here, kept for interface compatibility
    workdir     -- unused here, kept for interface compatibility
    merge       -- the string "true" enables merging; anything else disables it
    rm_header   -- number of leading lines dropped from every merged file
    concat      -- path of the merged output file (only used when merging)

    When merging, every extracted file is appended to ``concat`` and the
    extraction directory is removed. Otherwise the extracted files stay in
    ``decompress_files`` and are renamed: no extension becomes ``.txt``,
    ``.fa`` becomes ``.fasta`` and ``.fq`` becomes ``.fastq``.
    """
    # create the extraction directory (relative to the current directory)
    extract_dir = "decompress_files"
    os.mkdir(extract_dir)

    # open log file; the try/finally guarantees it is closed on any error
    mylog = open(logfile, "w")
    try:
        is_gzipped, is_gzvalid = check_gzip( archive )
        is_bzipped, is_bzvalid = check_bz2( archive )

        # name used for the single file produced by a gzip/bz2 archive
        single_target = os.path.join(
            extract_dir, os.path.splitext(os.path.basename(archivename))[0])

        # test if is a zip file
        if check_zip( archive ):
            with zipfile.ZipFile(archive, 'r') as myarchive:
                myarchive.extractall(extract_dir)

        # test if is a tar file
        elif tarfile.is_tarfile( archive ):
            mytarfile = tarfile.open(archive)
            try:
                # The archive is user supplied: refuse members whose path
                # would land outside the extraction directory (absolute
                # paths, "../" components). This is a basic guard only.
                safe_members = []
                for member in mytarfile.getmembers():
                    if _is_inside_directory(extract_dir, member.name):
                        safe_members.append(member)
                    else:
                        mylog.write(member.name+" is skipped: its path points outside the extraction directory\n")
                mytarfile.extractall(extract_dir, members=safe_members)
            finally:
                mytarfile.close()

        # test if is a gzip file
        elif is_gzipped and is_gzvalid:
            _decompress_single_file(gzip.open(archive, 'rb'), single_target)

        # test if is a bz2 file
        elif is_bzipped and is_bzvalid:
            _decompress_single_file(bz2.BZ2File(archive, 'rb'), single_target)

        # test if merge is enable
        if merge == "true":
            mylog.write("Merge option is enabled with "+str(rm_header)+" lines to deleted\n\n")
            # convert once; a value <= 0 means "keep every line"
            nb_header_lines = int(rm_header)
            with open(concat, "w") as myfinalfile:
                for myfile in listdirectory(extract_dir):
                    mylog.write(os.path.basename(myfile)+" is extracted from the archive and is added into the result file\n")
                    with open(myfile, "r") as myopenfile:
                        for line_index, line in enumerate(myopenfile):
                            # skip the header lines, write everything after
                            if line_index >= nb_header_lines:
                                myfinalfile.write(line)

            shutil.rmtree(extract_dir)

        else:
            # if merge is disable
            mylog.write("Merge option is disabled\n\n")

            # rename the extracted files (found recursively) in place
            for myfile in listdirectory(extract_dir):
                myfileclean = myfile.replace(" ", "\\ ")

                mylog.write(os.path.basename(myfileclean)+" is extracted from the archive \n")

                source = os.path.abspath(myfile)
                # Split the extension off once so that only the real suffix
                # is rewritten, never a ".fa"/".fq" found elsewhere in the path.
                root, extension = os.path.splitext(source)
                fileext = extension.replace(".", "")

                # if no extension
                if fileext == '':
                    shutil.move(source, source+".txt")
                elif fileext == 'fa':
                    shutil.move(source, root+".fasta")
                elif fileext == 'fq':
                    shutil.move(source, root+".fastq")

        mylog.write("\nPlease refresh your history if all files are not present\n")
    finally:
        mylog.close()


def _is_inside_directory(directory, member_name):
    """Return True when member_name, joined onto directory, stays inside it."""
    root = os.path.realpath(directory)
    target = os.path.realpath(os.path.join(root, member_name))
    return target == root or target.startswith(root + os.sep)


def _decompress_single_file(compressed, destination):
    """Stream the open binary file object compressed into destination.

    Both the source object and the destination file are closed, even when
    the copy fails.
    """
    try:
        with open(destination, 'wb') as output:
            # copyfileobj stops on an empty read for both str and bytes, so
            # it terminates on Python 3 where read() returns b'' at EOF.
            shutil.copyfileobj(compressed, output, 2**20)
    finally:
        compressed.close()
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
114
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
115
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
116
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
117 # parse the directory and return files path (in a tab)
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
def listdirectory(path):
    """Return the paths of all non-directory entries found recursively under path.

    path -- directory to scan.

    Returns a list of paths, each built by joining path with the entry
    names below it. Entries are visited in sorted name order so the result
    is deterministic. Names starting with a dot are ignored, and a path
    that is not an existing directory yields an empty list.
    """
    # A missing directory (or a plain file) simply has no entries.
    if not os.path.isdir(path):
        return []

    myfile = []
    # os.listdir takes names literally, so directories whose names contain
    # glob metacharacters such as "[" or "*" are scanned correctly.
    for name in sorted(os.listdir(path)):
        # keep ignoring hidden entries, as the "*" glob pattern did
        if name.startswith('.'):
            continue
        entry = os.path.join(path, name)
        # if directory, collect its content recursively
        if os.path.isdir(entry):
            myfile.extend(listdirectory(entry))
        # else put the file in the list
        else:
            myfile.append(entry)
    return myfile
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
129
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
130
78d968479d52 Imported from capsule None
cmonjeau
parents:
diff changeset
# Script entry point: forward the command-line arguments positionally to main().
if __name__ == "__main__":
    main(*sys.argv[1:])