view stacks.py @ 2:c9e10e0d6c10

fix discard file
author cmonjeau
date Mon, 24 Aug 2015 15:39:12 +0000
parents d6ba40f6c824
children 0e0ff9e9c761
line wrap: on
line source

"""

STACKS METHODS FOR GALAXY

Created by Cyril Monjeaud & Yvan Le Bras
Cyril.Monjeaud@irisa.fr
yvan.le_bras@irisa.fr

Last modifications : 01/22/2014


"""

import os, sys, re, shutil
import glob 
import collections
import gzip, zipfile, tarfile
import subprocess
from galaxy.datatypes.checkers import *


"""

STACKS COMMON METHODS

galaxy_config_to_tabfiles(input_config)
galaxy_config_to_tabfiles_for_STACKS(input_config)
extract_compress_files_from_tabfiles(tab_files, tmp_input_dir)
create_symlinks_from_tabfiles(tab_files, tmp_input_dir)

"""
def galaxy_config_to_tabfiles(input_config):

	tab_files=collections.OrderedDict()
	for line in open(input_config, "r").readlines():
		if line.strip() != '':
			extract=line.strip().split("::")
			tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1]

	# tabfiles[name]-> path
	return tab_files


def galaxy_config_to_tabfiles_for_STACKS(input_config):

	tab_files=collections.OrderedDict()
	for line in open(input_config, "r").readlines():
		if line.strip() != '':
			extract=line.strip().split("::")
			parse_name=re.search("^STACKS.*\((.*\.[ATCG]*\.fq)\)$", extract[0])
			# rename galaxy name in a short name
			if parse_name:				
				extract[0]=parse_name.groups(1)[0]

			tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1]
			
	# tabfiles[name]-> path
	return tab_files


def extract_compress_files_from_tabfiles(tab_files, tmp_input_dir):
	
	# for each file
	for key in tab_files.keys():
		#test if is zip file
		if (check_zip( tab_files[key] )):

			# extract all files names and added it in the tab
			myarchive = zipfile.ZipFile(tab_files[key], 'r')
			for i in myarchive.namelist():
				tab_files[i]=tmp_input_dir+"/"+i

			# extract all files
			myarchive.extractall(tmp_input_dir)

			#remove compress file from the tab
			del tab_files[key]

		#test if is tar.gz file
		else:
			if tarfile.is_tarfile( tab_files[key] ) and check_gzip( tab_files[key] ):
				# extract all files names and added it in the tab
				mygzfile = tarfile.open(tab_files[key], 'r')
			
				for i in mygzfile.getnames():
					tab_files[i]=tmp_input_dir+"/"+i
		
				# extract all files
				mygzfile.extractall(tmp_input_dir)

				#remove compress file from the tab
				del tab_files[key]



def create_symlinks_from_tabfiles(tab_files, tmp_input_dir):

	for key in tab_files.keys():
		#print "file single: "+key+" -> "+tab_files[key]
		#create a sym_link in our temp dir
		if not os.path.exists(tmp_input_dir+'/'+key):
			os.symlink(tab_files[key], tmp_input_dir+'/'+key)


"""

PROCESS RADTAGS METHODS

generate_additional_file(tmp_output_dir, output_archive)

"""

def change_outputs_procrad_name(tmp_output_dir, sample_name):

	list_files = glob.glob(tmp_output_dir+'/*')

	for file in list_files:
		# change sample name
		new_file_name=os.path.basename(file.replace("_",".").replace("sample", sample_name))

		# transform .fa -> .fasta or .fq->.fastq
		if os.path.splitext(new_file_name)[1] == ".fa":
			new_file_name = os.path.splitext(new_file_name)[0]+'.fasta'
		else:
			new_file_name = os.path.splitext(new_file_name)[0]+'.fastq'

		shutil.move(tmp_output_dir+'/'+os.path.basename(file), tmp_output_dir+'/'+new_file_name)


def generate_additional_archive_file(tmp_output_dir, output_archive):

	list_files = glob.glob(tmp_output_dir+'/*')

        myzip=zipfile.ZipFile("archive.zip.temp", 'w', allowZip64=True)

	# for each fastq file
	for fastq_file in list_files:
		# add file to the archive output
		myzip.write(fastq_file, os.path.basename(fastq_file))

	shutil.move("archive.zip.temp", output_archive)


"""

DENOVOMAP METHODS

check_fastq_extension_and_add(tab_files, tmp_input_dir)

"""

def check_fastq_extension_and_add(tab_files, tmp_input_dir):
	
	# for each file
	for key in tab_files.keys():

		if not re.search("\.fq$", key) and not re.search("\.fastq$", key) and not re.search("\.fa$", key) and not re.search("\.fasta$", key):
			# open the file
			myfastxfile=open(tab_files[key], 'r')
			
			# get the header
			line = myfastxfile.readline()
		        line = line.strip()

			# fasta rapid test
			if line.startswith( '>' ):
                        	tab_files[key+".fasta"]=tab_files[key]
				del tab_files[key]
			# fastq rapid test
			elif line.startswith( '@' ):
				tab_files[key+".fq"]=tab_files[key]
				del tab_files[key]
			else:
				print "[WARNING] : your input file "+key+" was not extension and is not recognize as a Fasta or Fastq file"
		
			myfastxfile.close()


"""

REFMAP METHODS

"""

def check_sam_extension_and_add(tab_files, tmp_input_dir):
	
	# for each file
	for key in tab_files.keys():

		if not re.search("\.sam$", key):
			# add the extension
			tab_files[key+".sam"]=tab_files[key]
			del tab_files[key]






"""

PREPARE POPULATION MAP METHODS

generate_popmap_for_denovo(tab_files, infos_file, pop_map)
generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map)


"""
def generate_popmap_for_denovo(tab_files, infos_file, pop_map):

	# initiate the dict : barcode -> tab[seq]
	fq_name_for_barcode={}

	for key in tab_files:
		single_barcode=re.search("([ATCG]*)(\.fq|\.fastq)", key).groups(0)[0]
		fq_name_for_barcode[single_barcode]=key

	# open the infos file and output file
	my_open_info_file=open(infos_file, 'r')
	my_output_file=open(pop_map, 'w')

	# conversion tab for population to integer
	pop_to_int=[]

	# write infos into the final output
	for line in my_open_info_file:

		parse_line=re.search("(^[ATCG]+)\t(.*)", line.strip())

		if not parse_line:
			print "[WARNING] Wrong input infos file structure : "+line
		else:
			barcode=parse_line.groups(1)[0]
			population_name=parse_line.groups(1)[1]

			# if its the first meet with the population
			if population_name not in pop_to_int:
				pop_to_int.append(population_name)		

			# manage ext if present, because the population map file should not have the ext
			if re.search("(\.fq$|\.fastq$)", fq_name_for_barcode[barcode]):
				fqfile=os.path.splitext(fq_name_for_barcode[barcode])[0]
			else:
				fqfile=fq_name_for_barcode[barcode]


			# write in the file
			my_output_file.write(fqfile+"\t"+str(pop_to_int.index(population_name))+"\n")

	# close files
	my_output_file.close()
	my_open_info_file.close()




def generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map):

	# initiate the dict : barcode -> tab[seq]
	seq_id_for_barcode={}

	# initiate the dict : barcode -> sam_name
	sam_name_for_barcode={}

	### Parse fastqfiles ###
	# insert my barcode into a tab with sequences ID associated
	for fastq_file in tab_fq_files.keys():
		single_barcode=re.search("([ATCG]*)(\.fq|\.fastq)", fastq_file).groups(0)[0]
        
        	# open the fasq file
        	open_fastqfile=open(tab_fq_files[fastq_file], 'r')
		
		# for each line, get the seq ID
		tab_seq_id=[]
		for line in open_fastqfile:
		    my_match_seqID=re.search("^@([A-Z0-9]+\.[0-9]+)\s.*", line)
		    if my_match_seqID:    
		        tab_seq_id.append(my_match_seqID.groups(0)[0])

		# push in a dict the tab of seqID for the current barcode
		seq_id_for_barcode[single_barcode]=tab_seq_id


	### Parse samfiles and get the first seq id ###
	for sam_file in tab_sam_files.keys():

		# open the sam file
        	open_samfile=open(tab_sam_files[sam_file], 'r')
		
		# get the first seq id
		first_seq_id=''
		for line in open_samfile:
			if not re.search("^@", line):
				first_seq_id=line.split("\t")[0]
				break


		# map with seq_id_for_barcode structure
		for barcode in seq_id_for_barcode:
			for seq in seq_id_for_barcode[barcode]:
				if seq == first_seq_id:
					#print "sam -> "+sam_file+" seq -> "+first_seq_id+" barcode -> "+barcode
					sam_name_for_barcode[barcode]=sam_file
					break

	# open the infos file and output file
	my_open_info_file=open(infos_file, 'r')
	my_output_file=open(pop_map, 'w')

	# conversion tab for population to integer
	pop_to_int=[]

	# write infos into the final output
	for line in my_open_info_file:
		parse_line=re.search("(^[ATCG]+)\t(.*)", line)

		if not parse_line:
			print "[WARNING] Wrong input infos file structure : "+line
		else:

			# if its the first meet with the population
			if parse_line.groups(1)[1] not in pop_to_int:
				pop_to_int.append(parse_line.groups(1)[1])		

			# manage ext if present, because the population map file should not have the ext
			if re.search("\.sam", sam_name_for_barcode[parse_line.groups(1)[0]]):
				samfile=os.path.splitext(sam_name_for_barcode[parse_line.groups(1)[0]])[0]
			else:
				samfile=sam_name_for_barcode[parse_line.groups(1)[0]]

			# write in the file
			my_output_file.write(samfile+"\t"+str(pop_to_int.index(parse_line.groups(1)[1]))+"\n")

	# close files
	my_output_file.close()
	my_open_info_file.close()


"""

STACKS POPULATION


"""


def extract_compress_files(myfile, tmp_input_dir):

        #test if is zip file
        if (check_zip( myfile )):

                # extract all files names and added it in the tab
                myarchive = zipfile.ZipFile(myfile, 'r')

                # extract all files
                myarchive.extractall(tmp_input_dir)


        #test if is tar.gz file
        else:
                # extract all files names and added it in the tab
                mygzfile = tarfile.open(myfile, 'r')

                # extract all files
                mygzfile.extractall(tmp_input_dir)