diff source/fastq_uniq.c @ 0:816cb55b5a2d draft default tip

planemo upload for repository https://github.com/portiahollyoak/Tools commit c4769fd68ad9583d4b9dbdf212e4ecb5968cef1c-dirty
author portiahollyoak
date Thu, 02 Jun 2016 11:34:51 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/fastq_uniq.c	Thu Jun 02 11:34:51 2016 -0400
@@ -0,0 +1,373 @@
+/* This program removes duplicates from paired FASTQ sequences,
+ * which usually appear in mate pair libraries.
+ *
+ * This file and its partner were written by Haibin Xu, December 2011.
+ */
+
+#ifndef MAX_FILE_NUMBER
+	#define MAX_FILE_NUMBER 1000	/* rows in main()'s left/right input file name tables; overridable at build time */
+#endif
+
+#include <unistd.h>
+#include "fastq_pair_array.h"
+
+/* Print the FastUniq command-line help to stderr.
+ *
+ * Takes no arguments and returns nothing. The help text is kept in a table,
+ * one entry per output line, and written in order with fputs().
+ */
+void fastq_uniq_usage(void)
+{
+	static const char *const usage_lines[] = {
+		"-i : The input file list of paired FASTQ sequence files [FILE IN]\n",
+		"        Maximum 1000 pairs\n",
+		"\n",
+		"     This parameter is used to specify a list of paired sequence files in\n",
+		"     FASTQ format as input, in which two adjacent files with reads in the\n",
+		"     same order belong to a pair.\n",
+		"\n",
+		"-t : Output sequence format [q/f/p]\n",
+		"        q : FASTQ format into TWO output files\n",
+		"        f : FASTA format into TWO output files\n",
+		"        p : FASTA format into ONE output file\n",
+		"        default = q\n",
+		"\n",
+		"     This parameter is used to specify sequence format in output file(s).\n",
+		"     FastUniq could output read pairs into two files in either FASTQ [q]\n",
+		"     or FASTA [f] format, in which reads in the same order belonging to a\n",
+		"     pair. FastUniq could also output read pairs into a single file in\n",
+		"     FASTA format [p], in which adjacent reads belonging to a pair.\n",
+		"\n",
+		"-o : The first output file [FILE OUT]\n",
+		"\n",
+		"-p : The second output file [FILE OUT]\n",
+		"     Optional. ONLY required when output sequence format(-t) is specified as\n",
+		"     [q] or [f].\n",
+		"\n",
+		"-c : Types of sequence descriptions for output [0/1]\n",
+		"        0 : The raw descriptions\n",
+		"        1 : New serial numbers assigned by FastUniq\n",
+		"        default = 0\n",
+		"\n"
+	};
+	size_t line;
+
+	for(line=0; line<sizeof usage_lines/sizeof usage_lines[0]; line++)
+		fputs(usage_lines[line], stderr);
+}
+
+/* Size, including the terminating NUL, of every file name buffer used by main(). */
+enum { FASTQ_UNIQ_NAME_SIZE = 1000 };
+
+/* Copy a command-line value into a fixed-size buffer.
+ * Returns 0 on success, 1 when src is empty or does not fit into dst. */
+static int fastq_uniq_copy_arg(char *dst, size_t dst_size, const char *src)
+{
+	size_t len=strlen(src);
+
+	if(len==0 || len>=dst_size)
+		return 1;
+	strcpy(dst, src);
+	return 0;
+}
+
+/* Read one line of the file list into buf and strip its trailing newline.
+ * Returns 1 when a line was read, 0 at end of file with nothing read, and
+ * -1 on a read error or when the line does not fit into buf. */
+static int fastq_uniq_read_name(FILE *fp, char *buf, size_t buf_size)
+{
+	size_t len;
+
+	buf[0]='\0';
+	if(fgets(buf, (int)buf_size, fp)==NULL)
+		return ferror(fp) ? -1 : 0;
+
+	len=strlen(buf);
+	if(len>0 && buf[len-1]=='\n')
+		buf[len-1]='\0';
+	else if(!feof(fp))
+		return -1;	/* no newline before the buffer filled up: line too long */
+	return 1;
+}
+
+/* Entry point of FastUniq.
+ *
+ * Parses the options -i (file list), -t (output format q/f/p, default q),
+ * -o (first output file), -p (second output file) and -c (description type
+ * 0/1, default 0), reads the listed left/right FASTQ file pairs into memory,
+ * sorts them with fastq_pair_array_sort() and writes the result with
+ * fastq_pair_array_printf().
+ *
+ * Returns 0 on success and 1 after printing the usage text or an error
+ * message to stderr.
+ */
+int main (int argc, char * argv[])
+{
+	/* static: with the default MAX_FILE_NUMBER the two tables take about 2 MB,
+	 * which is too much for some stacks; static storage is also zero-filled,
+	 * so every unused row is an empty string */
+	static char str_in_left[MAX_FILE_NUMBER][FASTQ_UNIQ_NAME_SIZE];
+	static char str_in_right[MAX_FILE_NUMBER][FASTQ_UNIQ_NAME_SIZE];
+	FILE *fp_in_list, *fp_in_left, *fp_in_right, *fp_out_left, *fp_out_right=NULL;
+	char str_in_list[FASTQ_UNIQ_NAME_SIZE]="";
+	char str_out_left[FASTQ_UNIQ_NAME_SIZE]="";
+	char str_out_right[FASTQ_UNIQ_NAME_SIZE]="";
+	char s_left[FASTQ_UNIQ_NAME_SIZE], s_right[FASTQ_UNIQ_NAME_SIZE];
+	char output_format='q';
+	int description_type=0;
+	int flag_i=0, flag_o=0, flag_p=0;
+	int ch;		/* int, not char: getopt() returns int and -1 must stay representable */
+	int status, description_flag, quality_flag;
+	FASTQ_PAIR *fq_pair;
+	FASTQ_PAIR_ARRAY *fq_pair_array, *temp_fq_pair_array;
+	long i, file_pair_count, seq_pair_count;
+	
+	if(argc==1)
+	{
+		fastq_uniq_usage();
+		return 1;
+	}
+	
+	/* obtain inputted arguments; any malformed or unknown option ends the run */
+	while((ch=getopt(argc, argv, "i:t:o:p:c:"))!=-1)
+	{
+		switch(ch)
+		{
+			case 'i':
+				if(fastq_uniq_copy_arg(str_in_list, sizeof str_in_list, optarg)!=0)
+				{
+					fastq_uniq_usage();
+					return 1;
+				}
+				flag_i=1;
+				break;
+			case 't':
+				if(strcmp(optarg,"q")!=0 && strcmp(optarg,"f")!=0 && strcmp(optarg,"p")!=0)
+				{
+					fastq_uniq_usage();
+					return 1;
+				}
+				output_format=optarg[0];
+				break;
+			case 'o':
+				if(fastq_uniq_copy_arg(str_out_left, sizeof str_out_left, optarg)!=0)
+				{
+					fastq_uniq_usage();
+					return 1;
+				}
+				flag_o=1;
+				break;
+			case 'p':
+				if(fastq_uniq_copy_arg(str_out_right, sizeof str_out_right, optarg)!=0)
+				{
+					fastq_uniq_usage();
+					return 1;
+				}
+				flag_p=1;
+				break;
+			case 'c':
+				if(strcmp(optarg,"0")==0)
+					description_type=0;
+				else if(strcmp(optarg,"1")==0)
+					description_type=1;
+				else
+				{
+					fastq_uniq_usage();
+					return 1;
+				}
+				break;
+			default:
+				fastq_uniq_usage();
+				return 1;
+		}
+	}
+	
+	/* check inputted arguments */
+	if(flag_i==0)
+	{
+		fprintf(stderr, "Error in input the name of FASTQ file list!\n");
+		return 1;
+	}
+	if(flag_o==0 || (output_format!='p' && flag_p==0))
+	{
+		fprintf(stderr, "Error in output sequence file name!\n");
+		return 1;
+	}
+	
+	/* get pair-end FASTQ file list: lines alternate left file, right file */
+	if((fp_in_list=fopen(str_in_list, "r"))==NULL)
+	{
+		fprintf(stderr, "Error in open FASTQ file list %s for read!\n",
+				str_in_list);
+		return 1;
+	}
+	for(file_pair_count=0; file_pair_count<MAX_FILE_NUMBER; file_pair_count++)
+	{
+		/* get the file store left FASTQ sequences; a clean end of file ends the list */
+		status=fastq_uniq_read_name(fp_in_list, s_left, sizeof s_left);
+		if(status==0)
+			break;
+		if(status<0 || s_left[0]=='\0')
+		{
+			fprintf(stderr, "Error in read from FASTQ file list!\n");
+			fclose(fp_in_list);
+			return 1;
+		}
+		
+		/* get the file store right FASTQ sequences; it must be present */
+		status=fastq_uniq_read_name(fp_in_list, s_right, sizeof s_right);
+		if(status<=0 || s_right[0]=='\0')
+		{
+			fprintf(stderr, "Error in read from FASTQ file list!\n");
+			fclose(fp_in_list);
+			return 1;
+		}
+		
+		/* append the file names to the list arrays */
+		strcpy(str_in_left[file_pair_count], s_left);
+		strcpy(str_in_right[file_pair_count], s_right);
+	}
+	/* tell the user when the list holds more entries than the tables can take */
+	if(file_pair_count==MAX_FILE_NUMBER &&
+	   fgets(s_left, (int)sizeof s_left, fp_in_list)!=NULL)
+		fprintf(stderr, "Warning: FASTQ file list has more than %d pairs, the rest is ignored!\n",
+				MAX_FILE_NUMBER);
+	fclose(fp_in_list);
+	
+	/* check the status of pair-end FASTQ files before loading anything */
+	for(i=0;i<file_pair_count;i++)
+	{
+		if((fp_in_left=fopen(str_in_left[i], "r"))==NULL)
+		{
+			fprintf(stderr, "Error in open left fastq file %s for read!\n",
+					str_in_left[i]);
+			return 1;
+		}
+		fclose(fp_in_left);
+		
+		if((fp_in_right=fopen(str_in_right[i], "r"))==NULL)
+		{
+			fprintf(stderr, "Error in open right fastq file %s for read!\n",
+					str_in_right[i]);
+			return 1;
+		}
+		fclose(fp_in_right);
+	}
+	
+	/* read all pair-end FASTQ sequences into memory */
+	seq_pair_count=0;
+	if((fq_pair_array=fastq_pair_array_create())==NULL)
+	{
+		fprintf(stderr, "Error in allocate enough memory!\n");
+		return 1;
+	}
+	if((temp_fq_pair_array=fastq_pair_array_create())==NULL)
+	{
+		fprintf(stderr, "Error in allocate enough memory!\n");
+		return 1;
+	}
+	/* fourth and fifth arguments of fastq_pair_scanf(): 1 when the raw
+	 * descriptions were requested, and 1 when quality is required (FASTQ output) */
+	description_flag=(description_type==0)?1:0;
+	quality_flag=(output_format=='f' || output_format=='p')?0:1;
+	for(i=0;i<file_pair_count;i++)
+	{
+		/* open inputted pair-end FASTQ file */
+		if((fp_in_left=fopen(str_in_left[i], "r"))==NULL)
+		{
+			fprintf(stderr, "Error in open left fastq file %s for read!\n",
+					str_in_left[i]);
+			return 1;
+		}
+		if((fp_in_right=fopen(str_in_right[i], "r"))==NULL)
+		{
+			fprintf(stderr, "Error in open right fastq file %s for read!\n",
+					str_in_right[i]);
+			fclose(fp_in_left);
+			return 1;
+		}
+		
+		/* read sequences until either file is exhausted or a read fails */
+		for(;!feof(fp_in_left) && !feof(fp_in_right);)
+		{
+			if((fq_pair=fastq_pair_create())==NULL)
+			{
+				fprintf(stderr, "Error in allocate enough memory!\n");
+				return 1;
+			}
+			
+			if(fastq_pair_scanf(fq_pair, fp_in_left, fp_in_right, description_flag, quality_flag)!=0)
+			{
+				fastq_pair_remove(fq_pair);
+				break;
+			}
+			
+			/* the same pair object is registered in both arrays */
+			fastq_pair_array_append(fq_pair, fq_pair_array);
+			fastq_pair_array_append(fq_pair, temp_fq_pair_array);
+			seq_pair_count++;
+		}
+		
+		/* the loop stopped although neither file reached its end: a read failed */
+		status=(!feof(fp_in_left) && !feof(fp_in_right));
+		
+		/* release both handles before the next pair, so that long lists do
+		 * not run into the per-process limit on open files */
+		fclose(fp_in_left);
+		fclose(fp_in_right);
+		
+		if(status)
+		{
+			fprintf(stderr, "Error in Reading pair-end FASTQ sequence!\n");
+			return 1;
+		}
+	}
+	
+	/* create memory address index for each BLOCK in a FASTQ_PAIR_ARRAY */
+	fastq_pair_array_generate_index(fq_pair_array);
+	fastq_pair_array_generate_index(temp_fq_pair_array);
+	
+	/* sort the pair-end FASTQ sequences */
+	fastq_pair_array_sort(fq_pair_array, temp_fq_pair_array, 1, seq_pair_count);
+	
+	/* open output files; the second one is opened whenever -p was given */
+	if((fp_out_left=fopen(str_out_left, "w"))==NULL)
+	{
+		fprintf(stderr, "Error in open left fastq file %s for write!\n",
+				str_out_left);
+		return 1;
+	}
+	if(str_out_right[0]!='\0')
+	{
+		if((fp_out_right=fopen(str_out_right, "w"))==NULL)
+		{
+			fprintf(stderr, "Error in open right fastq file %s for write!\n",
+					str_out_right);
+			fclose(fp_out_left);
+			return 1;
+		}
+	}
+	
+	/* output the sequence in specific format */
+	if(output_format=='f')
+		fastq_pair_array_printf(fq_pair_array, fp_out_left, fp_out_right, "fa", description_type, 1);
+	else if(output_format=='p')
+		fastq_pair_array_printf(fq_pair_array, fp_out_left, NULL, "fa", description_type, 1);
+	else
+		fastq_pair_array_printf(fq_pair_array, fp_out_left, fp_out_right, "fq", description_type, 1);
+	
+	/* close output files; fclose() flushes, so a failure here means lost output */
+	status=0;
+	if(fclose(fp_out_left)!=0)
+		status=1;
+	if(fp_out_right!=NULL && fclose(fp_out_right)!=0)
+		status=1;
+	if(status)
+	{
+		fprintf(stderr, "Error in write output sequence file!\n");
+		return 1;
+	}
+	
+	/* The pair arrays are not released; the process ends right here.
+	 * NOTE(review): the original kept a commented-out
+	 * fastq_pair_array_remove(fq_pair_array) call at this point - confirm
+	 * whether freeing is safe given that both arrays hold the same
+	 * FASTQ_PAIR pointers before re-enabling it. */
+	
+	return 0;
+}
+