Mercurial > repos > dereeper > pgap
diff PGAP-1.2.1/Converter_NCBINewFormatData.pl @ 0:83e62a1aeeeb draft
Uploaded
author | dereeper |
---|---|
date | Thu, 24 Jun 2021 13:51:52 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP-1.2.1/Converter_NCBINewFormatData.pl Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,300 @@ +#!/usr/bin/perl +use strict; +use warnings; +use Getopt::Long; + + + +my %optionHash=qw(); +my $inprefix=""; +my $outprefix=""; +my $indir=""; +my $outdir=""; +my @inList; +my @outList; + +GetOptions(\%optionHash,"inprefix:s","outprefix:s","indir:s","outdir:s","help|h!"); + + +if(scalar(keys %optionHash)==0){ + &print_usage(""); +} + +if(exists($optionHash{"h"}) or exists($optionHash{"help"}) ){ + &print_usage(""); +} + +if( exists($optionHash{"inprefix"} ) ){ + $inprefix = $optionHash{"inprefix"}; + @inList=split(/\+/,$inprefix); +}else{ + &print_usage("--inprefix should be provided!"); +} + +if( exists($optionHash{"outprefix"} ) ){ + $outprefix = $optionHash{"outprefix"}; + @outList=split(/\+/,$outprefix); +} + +if( exists($optionHash{"indir"} ) ){ + $indir = $optionHash{"indir"}; +}else{ + &print_usage("--indir should be provided!"); +} + +if( exists($optionHash{"outdir"} ) ){ + $outdir = $optionHash{"outdir"}; +}else{ + &print_usage("--outdir should be provided!"); +} + + +if( $inprefix eq "" or $indir eq "" or $outdir eq "" ){ + &print_usage(""); +} + +if( scalar(@outList) >0 and scalar(@outList) != scalar(@inList) ){ + &print_usage("If outprefix was provided, the name number should be identical with inprefix"); +} + + +system("mkdir -p $outdir"); + +foreach my $idx (0 .. (scalar(@inList) -1) ){ + my $inname = $inList[$idx]; + my $feature_table = $indir."/".$inname."_feature_table.txt"; + my $faa = $indir."/".$inname."_protein.faa"; + my $fna = $indir."/".$inname."_genomic.fna"; + + my %genePositionHash; + my %genomeSequenceHash; + my %geneAnnotationHash; + my %gene2genomeHash; + my %genefaaHash; + my $count=0; + + my $outname; + + if($outprefix eq ""){ + my @tmp=split(/\./,$inname); + $outname = $tmp[0]; + }else{ + $outname = $outList[$idx]; + } + + # check file + if( ! -e $feature_table){ + print "$feature_table was not found!\n"; + print "$inname was skipped!\n"; + next; + } + + if( ! -e $faa){ + print "$faa was not found!\n"; + print "$inname was skipped!\n"; + next; + } + + if( ! -e $fna){ + print "$fna was not found!\n"; + print "$inname was skipped!\n"; + next; + } + + # read feature + &read_feature($feature_table,\%geneAnnotationHash,\%genePositionHash,\%gene2genomeHash); + + # get genome sequence + &read_genomeSequence($fna,\%genomeSequenceHash); + + # get faa + &read_faa($faa,\%genefaaHash); + + # extract nuc and output + + open(PEP,">$outdir/$outname.pep"); + open(NUC,">$outdir/$outname.nuc"); + open(FUN,">$outdir/$outname.function"); + + foreach my $mygene (keys %genePositionHash){ + if( (!exists( $geneAnnotationHash{$mygene})) or (!exists($gene2genomeHash{$mygene})) or (!exists($genefaaHash{$mygene})) or (!exists($genomeSequenceHash{$gene2genomeHash{$mygene}})) ){ + print $inname . "\t" . $mygene . "\t" . "skipped for insufficient data!\n"; + next; + } + + my $nuc = &getSeq($genePositionHash{$mygene}, $genomeSequenceHash{$gene2genomeHash{$mygene}} ); + + if( $nuc eq ""){ + print $inname . "\t" . $mygene . "\t" . "skipped for insufficient data!\n"; + next; + } + + print PEP ">$mygene\n$genefaaHash{$mygene}\n"; + print NUC ">$mygene\n$nuc\n"; + print FUN "$mygene\t-\t$geneAnnotationHash{$mygene}\n"; + $count++; + } + close(PEP); + close(NUC); + close(FUN); + + print $inname . " -> $outname: $count genes were extracted in total.\n"; +} + + +sub getSeq(){ + (my $pos,my $genomeSeq)=@_; + my @tmp=split(/\|/,$pos); + if( $tmp[1]> length($genomeSeq) ){ + return ""; + } + my $seq = substr($genomeSeq,$tmp[0]-1,$tmp[1]-$tmp[0]+1); + if($tmp[2] eq "-"){ + $seq = &rcseq($seq); + } + return $seq; +} + + + +sub rcseq(){ + (my $seq)=@_; + $seq=uc($seq); + + $_=$seq; + tr/ACGT/TGCA/; + $seq = $_; + return reverse($_); +} + + + +sub read_faa(){ + (my $infile, my $seqHash)=@_; + my $seq=""; + my $name=""; + my @content; + open(F,$infile); + @content=<F>; + close(F); + chomp(@content); + + for (my $line = 0; $line < @content; $line++) { + if($content[$line] =~/^>/ ){ + if($name ne ""){ + $$seqHash{$name}=$seq; + } + my @tmp=split(/\s+/,$content[$line]); + @tmp=split(/\./,$tmp[0]); + + $name=substr($tmp[0], 1, length($tmp[0])-1); + $seq=""; + }else{ + $seq = $seq . $content[$line]; + } + } + $$seqHash{$name}=$seq; +} + + + + + + +sub read_genomeSequence(){ + (my $infile,my $seqHash)=@_; + my $seq=""; + my $name=""; + my @content; + open(F,$infile); + @content=<F>; + close(F); + chomp(@content); + + for (my $line = 0; $line < @content; $line++) { + if($content[$line] =~/^>/ ){ + if($name ne ""){ + $$seqHash{$name}=$seq; + } + my @tmp=split(/\s+/,$content[$line]); + $name=substr($tmp[0], 1, length($tmp[0])-1); + $seq=""; + }else{ + $seq = $seq . $content[$line]; + } + } + $$seqHash{$name}=$seq; +} + + + +sub read_feature(){ + (my $infile, my $annHash,my $posHash,my $gene2genomeHash)=@_; + my @content; + open(F,$infile); + @content=<F>; + close(F); + chomp(@content); + + for (my $line = 1; $line < @content - 1; $line=$line+1) { + my @row=split(/\t/,$content[$line]); + my @nextrow=split(/\t/,$content[$line+1]); + + if($row[1] ne "protein_coding"){ + next; + } + + # in case + if( $row[7]."|".$row[8]."|".$row[9] ne $nextrow[7]."|".$nextrow[8]."|".$nextrow[9]){ + next; + } + + # print $nextrow[10]."\n"; + # print $row[7]."|".$row[8]."|".$row[9]."\n"; + + my @tmp=split(/\./,$nextrow[10]); + my $geneName = $tmp[0]; + + $$annHash{$geneName}=$nextrow[13]; + $$posHash{$geneName}=$row[7]."|".$row[8]."|".$row[9]; + $$gene2genomeHash{$geneName}=$row[6]; + } +} + + +sub print_usage(){ + my $message=shift; +my @usage=qq( +Version: 2016042201 +Usage: perl Converter_NCBINewFormatData.pl [options] + + +Options: + + --inprefix String The prefix of the input files, such as GCF_000007545.1_ASM754v1 + If two or more strains were provided, please join their prefixs with "+" + Such as GCF_000007545.1_ASM754v1+GCF_000008105.1_ASM810v1+GCF_000711315.1_ASM71131v1 + + --indir String The directory of those input files + + --outprefix String The prefix for the output files. + If a value "ty2" was provided, the output files would be: ty2.nuc, ty2.pep, and ty2.function + If two or more strains were provided, please join their prefixs with "+" + If the prefix was not provided, the assembly value would be used as the prefix, such as GCF_000007545 + + --outdir String The directory for the output files + +Note: + 1. Before converting data with this script, please prepare *feature_table.txt, *genomic.fna and *protein.faa files for each strain. + + 2. This script was designed for NCBI new format data only. If part of your data is in the old format, please use the Converter_finished.pl or Converter_draft.pl script to convert the data. + + +); + + print join("\n",@usage)."\n"; + print $message."\n"; + exit; +} + +