Mercurial > repos > cmonjeau > decompress_an_archive_and_merge
changeset 0:78d968479d52 draft default tip
Imported from capsule None
author | cmonjeau |
---|---|
date | Tue, 07 Jul 2015 08:49:40 -0400 |
parents | |
children | |
files | extract_archive_and_merge.py extract_archive_and_merge.xml repository_dependencies.xml |
diffstat | 3 files changed, 195 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# From the changeset diff: --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/extract_archive_and_merge.py Tue Jul 07 08:49:40 2015 -0400 @@ -0,0 +1,131 @@
"""
Decompress an archive (zip, tar, gzip or bz2) and optionally merge its files.

Created by Cyril MONJEAUD
Cyril.Monjeaud@irisa.fr
Last modification: 11/19/2014

And with the help of Anthony Bretaudeau for some stuff with bz2.

"""

import argparse
import bz2
import glob
import gzip
import itertools
import os
import shutil
import subprocess
import sys
import tarfile
import tempfile
import zipfile

# Directory (relative to the current working directory) that receives the
# extracted files.
EXTRACT_DIR = "decompress_files"

# Short extensions (without the dot) and the long extension replacing them.
_EXTENSION_ALIASES = {"fa": ".fasta", "fq": ".fastq"}

# Chunk size used when streaming decompressed data to disk.
_CHUNK_SIZE = 2 ** 20


def stop_err(msg):
    """Write *msg* to stderr and terminate with a non-zero exit status."""
    sys.stderr.write('%s\n' % msg)
    # A bare sys.exit() reports success (status 0) to the caller.
    sys.exit(1)


def _extract_tar(archive, destination):
    """Extract the tar *archive* into *destination*.

    Every member name is checked first; the script stops through stop_err()
    if a member would be written outside *destination*.  Only member names
    are checked, link targets are not inspected.
    """
    root = os.path.realpath(destination)
    with tarfile.open(archive) as tar:
        for member in tar.getmembers():
            target = os.path.realpath(os.path.join(root, member.name))
            if target != root and not target.startswith(root + os.sep):
                stop_err("Refusing to extract '%s': it would be written "
                         "outside of the extraction directory" % member.name)
        tar.extractall(destination)


def _decompress_single_file(opener, archive, archivename, destination):
    """Decompress a single compressed file into *destination*.

    *opener* is called as ``opener(archive, 'rb')`` and must return a
    readable binary file object (gzip.open and bz2.BZ2File are used by
    main()).  The output file is named after the basename of *archivename*
    with its last extension removed.
    """
    name = os.path.splitext(os.path.basename(archivename))[0]
    source = opener(archive, 'rb')
    try:
        with open(os.path.join(destination, name), 'wb') as target:
            # copyfileobj stops on an empty read whether it is '' or b'',
            # unlike an iter(..., '') sentinel loop which never ends when
            # read() returns bytes.
            shutil.copyfileobj(source, target, _CHUNK_SIZE)
    finally:
        source.close()


def _merge_files(paths, rm_header, concat, log):
    """Concatenate *paths* into the file *concat*.

    The first *rm_header* lines of every input file are skipped (negative
    values are treated as 0).  One line per merged file is written to *log*.
    Files are copied in binary mode so their content is not re-encoded.
    """
    skipped = max(int(rm_header), 0)
    with open(concat, "wb") as merged:
        for path in paths:
            log.write(os.path.basename(path) + " is extracted from the archive and is added into the result file\n")
            with open(path, "rb") as extracted:
                merged.writelines(itertools.islice(extracted, skipped, None))


def _normalized_path(path):
    """Return the new name for an extracted file, or None to keep its name.

    Files without extension get ".txt" appended; ".fa" becomes ".fasta" and
    ".fq" becomes ".fastq".  Only the final extension is touched, so a
    directory name containing ".fa" or ".fq" is left alone.
    """
    root, ext = os.path.splitext(path)
    ext = ext.replace(".", "")
    if ext == "":
        return path + ".txt"
    if ext in _EXTENSION_ALIASES:
        return root + _EXTENSION_ALIASES[ext]
    return None


def main(archive, archivename, logfile, logid, workdir, merge, rm_header=0, concat=''):
    """Extract *archive* into EXTRACT_DIR, then merge or rename its files.

    All arguments arrive as strings from the command line.

    archive     -- path of the archive to decompress
    archivename -- display name of the archive; used to name the output of
                   a single-file gzip/bz2 archive
    logfile     -- path of the log file to write
    logid       -- accepted to match the command line, not used
    workdir     -- accepted to match the command line, not used
    merge       -- "true" to concatenate every extracted file into *concat*
                   and delete EXTRACT_DIR afterwards; any other value keeps
                   the files in EXTRACT_DIR and normalizes their extensions
    rm_header   -- number of leading lines dropped from each file on merge
    concat      -- path of the merged output file (merge mode only)
    """
    # Galaxy is imported here, not at module level, so that the helper
    # functions stay importable where Galaxy is not installed.
    # NOTE(review): `eggs` and `util` are not referenced below; they are kept
    # in case these imports are needed for their side effects -- confirm.
    from galaxy import eggs
    from galaxy import util
    from galaxy.datatypes.checkers import check_bz2, check_gzip, check_zip

    os.mkdir(EXTRACT_DIR)

    with open(logfile, "w") as mylog:
        is_gzipped, is_gzvalid = check_gzip(archive)
        is_bzipped, is_bzvalid = check_bz2(archive)

        # The first matching format wins: zip, tar, gzip, then bz2.
        if check_zip(archive):
            with zipfile.ZipFile(archive, 'r') as myarchive:
                myarchive.extractall(EXTRACT_DIR)
        elif tarfile.is_tarfile(archive):
            _extract_tar(archive, EXTRACT_DIR)
        elif is_gzipped and is_gzvalid:
            _decompress_single_file(gzip.open, archive, archivename, EXTRACT_DIR)
        elif is_bzipped and is_bzvalid:
            _decompress_single_file(bz2.BZ2File, archive, archivename, EXTRACT_DIR)

        if merge == "true":
            mylog.write("Merge option is enabled with "+str(rm_header)+" lines to deleted\n\n")
            _merge_files(listdirectory(EXTRACT_DIR), rm_header, concat, mylog)
            shutil.rmtree(EXTRACT_DIR)
        else:
            mylog.write("Merge option is disabled\n\n")

            for myfile in listdirectory(EXTRACT_DIR):
                # Spaces are shown backslash-escaped in the log only.
                myfileclean = myfile.replace(" ", "\\ ")
                mylog.write(os.path.basename(myfileclean)+" is extracted from the archive \n")

                source = os.path.abspath(myfile)
                renamed = _normalized_path(source)
                if renamed is not None:
                    shutil.move(source, renamed)

        mylog.write("\nPlease refresh your history if all files are not present\n")


def listdirectory(path):
    """Return the paths of all files found under *path*, recursively.

    Entries whose name starts with a dot are skipped, as a ``*`` glob would
    do.  Entries are visited in sorted order so the result does not depend
    on the order in which the filesystem lists them.  Directory names are
    taken literally, so names containing ``[`` or ``]`` are handled.  An
    empty list is returned when *path* is not a directory.
    """
    if not os.path.isdir(path):
        return []

    found = []
    for name in sorted(os.listdir(path)):
        if name.startswith('.'):
            continue
        entry = os.path.join(path, name)
        if os.path.isdir(entry):
            found.extend(listdirectory(entry))
        else:
            found.append(entry)
    return found


if __name__ == "__main__":
    main(*sys.argv[1:])
<!-- From the changeset diff: extract_archive_and_merge.xml (new file, 60 lines added) -->
<tool id="extract_archive_merge" name="Decompress an archive" version="1.0.0" force_history_refresh="True">
    <description>in zip, gz, tar.gz, fastq.gz, fastq.bz2 or tar.bz2 format</description>
    <!--
        The positional arguments map onto
        main(archive, archivename, logfile, logid, workdir, merge, rm_header, concat)
        in extract_archive_and_merge.py. The last two are only passed when
        merging is enabled.
    -->
    <command interpreter="python">

extract_archive_and_merge.py $input_archive "$input_archive.display_name" $log $log.id $__new_file_path__ $options_merge.merge
#if str( $options_merge.merge ) == 'true'
${options_merge.rm_header}
$concat
#end if

    </command>
    <inputs>
        <param name="input_archive" type="data" format="zip,tar.gz,tar.bz2,fastq.gz,fastq.bz2" label="Archive name" help="Accepts zip, tar.gz, tar.bz2, fastq.gz and fastq.bz2" />
        <conditional name="options_merge">
            <param name="merge" type="select" format="text" label="Merges all files into one">
                <option value="false">No</option>
                <option value="true">Yes</option>
            </param>
            <when value="false">
            </when>
            <when value="true">
                <param name="rm_header" type="integer" value="0" label="Header lines to delete" help="number of lines to delete at the beginning of each file"/>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data format="txt" name="log" label="decompress_an_archive.log" />
        <!-- Hidden placeholder output; the extracted files are discovered in the decompress_files directory. -->
        <data format="txt" name="additional" label="additional file with ${tool.name}" hidden="true">
            <discover_datasets pattern="__designation_and_ext__" directory="decompress_files" visible="true" />
        </data>

        <!-- Only produced when the merge option is set to "true". -->
        <data format="txt" name="concat" label="merge_file">
            <filter>(options_merge['merge'] == 'true')</filter>
        </data>
    </outputs>
    <help>
**Tool documentation**

This tool decompresses an archive file (zip, gz, tar.gz, fastq.gz, fastq.bz2 or tar.bz2) and can optionally merge all extracted files into a single one.
If the merge option is enabled, you can delete as many header lines as you need at the beginning of each file.

WARNING : the filename should not have special characters (space, brackets, ...), please rename it!!!

--------

**Created and integrated by:**

Cyril Monjeaud

GenOuest Bio-informatics Core Facility

UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France)

support@genouest.org

If you use this tool in Galaxy, please cite :

`Y. Le Bras, A. Roult, C. Monjeaud, M. Bahin, O. Quenez, C. Heriveau, A. Bretaudeau, O. Sallou, O. Collin, Towards a Life Sciences Virtual Research Environment : an e-Science initiative in Western France. JOBIM 2013. &lt;https://www.e-biogenouest.org/resources/128&gt;`_
    </help>
</tool>
<?xml version="1.0"?>
<!--
    From the changeset diff: repository_dependencies.xml (new file, 4 lines added).
    Declares a dependency on the "archive_datatypes" repository owned by
    cmonjeau, pinned to changeset revision 9e4c4b66b4a0 on the listed tool shed.
-->
<repositories description="This tool requires archive datatypes definitions (e.g. zip, tar.gz, etc. format).">
    <repository changeset_revision="9e4c4b66b4a0" name="archive_datatypes" owner="cmonjeau" toolshed="https://toolshed.g2.bx.psu.edu" />
</repositories>