view armdb_mirbase.py @ 8:6f0fa933b3ed draft

Uploaded
author glogobyte
date Wed, 20 Oct 2021 15:16:49 +0000
parents 40c50594116c
children
line wrap: on
line source

import subprocess
import argparse
import time
import urllib.request
from multiprocessing import Process, Queue
import itertools

#---------------------------------Arguments------------------------------------------

subprocess.call(['mkdir', 'out'])
parser = argparse.ArgumentParser()

parser.add_argument("-pos", "--positions", help="number of additional nucleotides", action="store")
parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
parser.add_argument("-gen", "--genome", help="genome version", action="store")
parser.add_argument("-gff3", "--gff", help="gff file",action="store")
args = parser.parse_args()

#-----------------------Download and read of the gff3 file---------------------------------

def read_url(q):

    url = 'ftp://mirbase.org/pub/mirbase/CURRENT/genomes/'+args.gff
    data = urllib.request.urlopen(url).read()
    file_mirna = data.decode('utf-8')
    file_mirna = file_mirna.split("\n")
    q.put(file_mirna)


#-----------------------Export of the original gff3 file---------------------------------

def write_gff(file_mirna):
    f = open('original_mirnas.bed', "w")

    for i in range(len(file_mirna)):
        f.write(file_mirna[i] + "\n")

#------------------------Process and export of the file with mature mirnas-------------------------------

def new_gff(file_mirna):

    mirna = []   		     # new list with shifted mirnas
    positions =int(args.positions)   # positions shifted
    print(str(positions)+" positions shifted")
    names=[]

    for i in range(len(file_mirna)):

        # Remove lines which conatain the word "primary"
        if "primary" not in file_mirna[i]:
            mirna.append(file_mirna[i])

            # Check if the line starts with "chr"
            if "chr" in file_mirna[i]:
                a=file_mirna[i].split("\t")[0]
                b=file_mirna[i].split("\t")[6]
                c=file_mirna[i].split("=")[3].split(";")[0]
                names.append([a,b,c])

    names.sort()
    sublists=[]

    [sublists.append([item] * names.count(item)) for item in names if names.count(item)>=2]
    sublists.sort()
    sublists=list(sublists for sublists, _ in itertools.groupby(sublists))
    unique_names=[[x[0][0],x[0][2]] for x in sublists]

    for x in unique_names:
        flag = 0
        for i in range(len(mirna)):

              if "chr" in mirna[i] and mirna[i].split("=")[3].split(";")[0]==x[1] and x[0]==mirna[i].split("\t")[0]:
                 flag+=1
                 ktr=mirna[i].split(";")[0]+";"+mirna[i].split(";")[1]+";"+mirna[i].split(";")[2]+"-{"+str(flag)+"}"+";"+mirna[i].split(";")[3]
                 mirna[i]=ktr


    f = open('shifted_mirnas.bed', "w")

    for i in range(len(mirna)):

        if "chr" in mirna[i]:

            # change the name of current mirna
            mirna_name_1 = mirna[i].split("=")[3]
            mirna_name_2 = mirna[i].split("=")[4]
            mirna_name_1 = mirna_name_1.split(";")[0]+"_"+mirna_name_2+"_"+mirna[i].split("\t")[0]
            mirna[i] = mirna[i].replace("miRNA", mirna_name_1)

            # Shift the position of mirna
            start = mirna[i].split("\t")[3]
            end = mirna[i].split("\t")[4]
            shift_start = int(start)-positions	# shift the interval to the left
            shift_end = int(end)+positions	# shift the interval to the right

            # Replace the previous intervals with the new
            mirna[i] = mirna[i].replace(start, str(shift_start))
            mirna[i] = mirna[i].replace(end, str(shift_end))

            f.write(mirna[i] + "\n")

    f.close()


#------------------------Extract the sequences of the Custom Arms with getfasta tool-------------------------------

def bedtool(genome):
    subprocess.call(["bedtools", "getfasta", "-fi", "/cvmfs/data.galaxyproject.org/byhand/"+genome+"/sam_index/"+genome+".fa", "-bed", "shifted_mirnas.bed", "-fo", "new_ref.fa", "-name", "-s"])
#===================================================================================================================================


if __name__=='__main__':

    starttime = time.time()
    q = Queue()

    # Read the original gff3 file
    p1 = Process(target=read_url(q))
    p1.start()
    p1.join()
    file_mirna=q.get()

    # Export the original gff3 file
    p2 = [Process(target=write_gff(file_mirna))]

    # Create the new gff3 file
    p2.extend([Process(target=new_gff(file_mirna))])
    [x.start() for x in p2]
    [x.join() for x in p2]

    # Extract the sequences of the Custom Arms
    p3 = Process(target=bedtool(args.genome))
    p3.start()
    p3.join()

    print('Runtime: {} seconds'.format(round(time.time() - starttime,2)))