diff extract_archive_and_merge.py @ 0:78d968479d52 draft default tip

Imported from capsule None
author cmonjeau
date Tue, 07 Jul 2015 08:49:40 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_archive_and_merge.py	Tue Jul 07 08:49:40 2015 -0400
@@ -0,0 +1,131 @@
+#!/usr/bin/env python
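+"""
+Extract a zip, tar, gzip or bzip2 archive and optionally merge the
+extracted files into a single output file.
+
+Created by Cyril MONJEAUD
+Cyril.Monjeaud@irisa.fr
+Last modification: 11/19/2014
+
+With the help of Anthony Bretaudeau for the bz2 handling.
+
+"""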
+
+import argparse, os, sys, subprocess, tempfile, shutil, gzip, zipfile, tarfile, gzip, bz2, shutil
+import glob 
+from galaxy import eggs
+from galaxy import util
+from galaxy.datatypes.checkers import *
+
+def stop_err(msg):
+    """Write ``msg`` to stderr and terminate the process with a failure status.
+
+    msg -- text of the error; a newline is appended before writing.
+
+    Raises SystemExit with exit code 1 so the calling process can tell the
+    run failed (a bare ``sys.exit()`` would report success).
+    """
+    sys.stderr.write('%s\n' % msg)
+    sys.exit(1)
+
+def _decompress_stream(opener, archive, archivename, dest_dir):
+    """Decompress a single-stream archive (gzip or bz2) into ``dest_dir``.
+
+    opener      -- callable such as ``gzip.open`` or ``bz2.BZ2File`` that
+                   accepts ``(path, 'rb')`` and returns a readable file object
+    archive     -- path of the compressed file to read
+    archivename -- display name of the archive; the output file is named
+                   after its basename with the last extension removed
+    dest_dir    -- directory that receives the decompressed file
+    """
+    target = os.path.join(
+        dest_dir, os.path.splitext(os.path.basename(archivename))[0])
+    with opener(archive, 'rb') as source:
+        with open(target, 'wb', 2**20) as sink:
+            # copyfileobj stops on an empty read for both str and bytes, so
+            # it does not depend on a '' sentinel matching the chunk type.
+            shutil.copyfileobj(source, sink, 2**20)
+
+
+def main(archive, archivename, logfile, logid, workdir, merge, rm_header=0, concat=''):
+    """Extract ``archive`` and either merge or rename the extracted files.
+
+    archive     -- path of the archive to extract (zip, tar, gzip or bz2)
+    archivename -- original name of the archive; used to name the output of
+                   a plain gzip/bz2 decompression
+    logfile     -- path of the log file to (over)write
+    logid       -- unused by this function
+    workdir     -- unused by this function
+    merge       -- the string "true" enables merging; anything else disables it
+    rm_header   -- number of leading lines to drop from every file when
+                   merging (int or numeric string; negative counts as 0)
+    concat      -- path of the merged output file (only used when merging)
+
+    Files are extracted into a ``decompress_files`` directory created in the
+    current working directory. When merging, that directory is removed once
+    the merged file is written. Otherwise the extracted files stay there;
+    files without extension get ``.txt`` and ``.fa``/``.fq`` are renamed to
+    ``.fasta``/``.fastq``.
+    """
+    extract_dir = "decompress_files"
+
+    # create the extraction directory (fails if it already exists)
+    os.mkdir(extract_dir)
+
+    # the with-block guarantees the log is closed in both branches and on error
+    with open(logfile, "w") as mylog:
+
+        is_gzipped, is_gzvalid = check_gzip(archive)
+        is_bzipped, is_bzvalid = check_bz2(archive)
+
+        # NOTE(review): extractall() trusts the member paths stored in the
+        # archive; a crafted tar can write outside extract_dir. Confirm the
+        # archives are trusted or validate member names before extracting.
+        if check_zip(archive):
+            with zipfile.ZipFile(archive, 'r') as myarchive:
+                myarchive.extractall(extract_dir)
+        elif tarfile.is_tarfile(archive):
+            with tarfile.open(archive) as mytarfile:
+                mytarfile.extractall(extract_dir)
+        elif is_gzipped and is_gzvalid:
+            _decompress_stream(gzip.open, archive, archivename, extract_dir)
+        elif is_bzipped and is_bzvalid:
+            _decompress_stream(bz2.BZ2File, archive, archivename, extract_dir)
+
+        if merge == "true":
+            mylog.write("Merge option is enabled with "+str(rm_header)+" lines to deleted\n\n")
+
+            # convert once; a negative count means "drop nothing"
+            skip = max(int(rm_header), 0)
+
+            with open(concat, "w") as myfinalfile:
+                for myfile in listdirectory(extract_dir):
+                    with open(myfile, "r") as myopenfile:
+                        mylog.write(os.path.basename(myfile)+" is extracted from the archive and is added into the result file\n")
+                        for index, line in enumerate(myopenfile):
+                            # the first ``skip`` lines of each file are headers
+                            if index >= skip:
+                                myfinalfile.write(line)
+
+            shutil.rmtree(extract_dir)
+
+        else:
+            mylog.write("Merge option is disabled\n\n")
+
+            # short extensions and their long replacements
+            renamed = {'fa': '.fasta', 'fq': '.fastq'}
+
+            for myfile in listdirectory(extract_dir):
+                myfileclean = myfile.replace(" ", "\\ ")
+
+                mylog.write(os.path.basename(myfileclean)+" is extracted from the archive \n")
+
+                source = os.path.abspath(myfile)
+                root, ext = os.path.splitext(source)
+                fileext = ext[1:]
+
+                if fileext == '':
+                    # no extension: treat as plain text
+                    shutil.move(source, source + ".txt")
+                elif fileext in renamed:
+                    # swap only the final extension; str.replace on the whole
+                    # path would also rewrite ".fa"/".fq" inside directory or
+                    # file names
+                    shutil.move(source, root + renamed[fileext])
+
+            mylog.write("\nPlease refresh your history if all files are not present\n")
+
+
+
+def listdirectory(path):
+    """Return the paths of all non-hidden files found recursively under ``path``.
+
+    path -- directory to scan
+
+    Returns a list of file paths, each prefixed with ``path``. Entries are
+    visited in sorted name order so the result is deterministic. Names that
+    start with a dot are skipped, as the previous ``glob('*')`` based scan
+    did. An empty list is returned when ``path`` is not a directory.
+    """
+    if not os.path.isdir(path):
+        return []
+
+    found = []
+    # os.listdir is used instead of glob so that directory names containing
+    # glob metacharacters ('[', ']', '*', '?') are still descended into.
+    for name in sorted(os.listdir(path)):
+        if name.startswith('.'):
+            continue
+        entry = os.path.join(path, name)
+        if os.path.isdir(entry):
+            found.extend(listdirectory(entry))
+        else:
+            found.append(entry)
+    return found
+
+
+# Script entry point: forward the command-line arguments positionally to main().
+if __name__ == "__main__":
+    main(*sys.argv[1:])