0
|
1 /******************************************************************************
|
|
2 ** @source genret
|
|
3 **
|
|
4 ** Retrieves various gene related information from genome flatfile
|
|
5 **
|
|
6 ** @author Copyright (C) 2012 Hidetoshi Itaya
|
|
7 ** @version 1.0.3
|
|
8 ** @modified 2012/1/20 Hidetoshi Itaya Created!
|
|
9 ** @modified 2013/6/16 Revision 1
|
|
10 ** @modified 2015/2/7 Refactor
|
|
11 ** @@
|
|
12 **
|
|
13 ** This program is free software; you can redistribute it and/or
|
|
14 ** modify it under the terms of the GNU General Public License
|
|
15 ** as published by the Free Software Foundation; either version 2
|
|
16 ** of the License, or (at your option) any later version.
|
|
17 **
|
|
18 ** This program is distributed in the hope that it will be useful,
|
|
19 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
20 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
21 ** GNU General Public License for more details.
|
|
22 **
|
|
23 ** You should have received a copy of the GNU General Public License
|
|
24 ** along with this program; if not, write to the Free Software
|
|
25 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
26 ******************************************************************************/
|
|
27
|
|
28 #include "emboss.h"
|
|
29 #include "glibs.h"
|
|
30
|
|
31
|
|
32
|
|
33
|
|
34 /* @prog genret ***************************************************************
|
|
35 **
|
|
36 ** Retrieves various gene related information from genome flatfile
|
|
37 **
|
|
38 ******************************************************************************/
|
|
39
|
|
40 int main(int argc, char *argv[])
|
|
41 {
|
|
42 embInitPV("genret", argc, argv, "GEMBASSY", "1.0.3");
|
|
43
|
|
44 AjPSeqall seqall;
|
|
45 AjPSeq seq = NULL;
|
|
46 AjPStr inseq = NULL;
|
|
47 AjPStr gene = NULL;
|
|
48 AjPStr access = NULL;
|
|
49 AjBool accid = ajTrue;
|
|
50 AjPStr argument = NULL;
|
|
51 AjPFile outfile = NULL;
|
|
52
|
|
53 AjPStr seqid = NULL;
|
|
54 AjPStr restid = NULL;
|
|
55
|
|
56 AjBool valid = ajFalse;
|
|
57 AjBool isseq = ajFalse;
|
|
58 AjBool isgbk = ajFalse;
|
|
59
|
|
60 AjPFilebuff buff = NULL;
|
|
61 AjPFile tmpfile = NULL;
|
|
62 AjPStr tmpname = NULL;
|
|
63
|
|
64 AjPStr regexstr = NULL;
|
|
65 AjPStrTok token = NULL;
|
|
66 AjPRegexp regex = NULL;
|
|
67
|
|
68 AjPStr url = NULL;
|
|
69 AjPStr base = NULL;
|
|
70 AjPStr head = NULL;
|
|
71 AjPStr line = NULL;
|
|
72
|
|
73 seqall = ajAcdGetSeqall("sequence");
|
|
74 access = ajAcdGetString("access");
|
|
75 gene = ajAcdGetString("gene");
|
|
76 argument = ajAcdGetString("argument");
|
|
77 accid = ajAcdGetBoolean("accid");
|
|
78 outfile = ajAcdGetOutfile("outfile");
|
|
79
|
|
80 if(
|
|
81 ajStrMatchC(access, "translation") ||
|
|
82 ajStrMatchC(access, "get_exon") ||
|
|
83 ajStrMatchC(access, "get_exons") ||
|
|
84 ajStrMatchC(access, "get_cdsseq") ||
|
|
85 ajStrMatchC(access, "get_gbkseq") ||
|
|
86 ajStrMatchC(access, "get_geneseq") ||
|
|
87 ajStrMatchC(access, "get_intron") ||
|
|
88 ajStrMatchC(access, "getseq") ||
|
|
89 ajStrMatchC(access, "seq") ||
|
|
90 ajStrMatchC(access, "around_startcodon") ||
|
|
91 ajStrMatchC(access, "around_stopcodon") ||
|
|
92 ajStrMatchC(access, "before_startcodon") ||
|
|
93 ajStrMatchC(access, "before_stopcodon") ||
|
|
94 ajStrMatchC(access, "after_startcodon") ||
|
|
95 ajStrMatchC(access, "after_stopcodon")
|
|
96 )
|
|
97 {
|
|
98 isseq = ajTrue;
|
|
99 }
|
|
100 else if(ajStrMatchC(access, "annotate") ||
|
|
101 ajStrMatchC(access, "output"))
|
|
102 {
|
|
103 isgbk = ajTrue;
|
|
104 }
|
|
105 else
|
|
106 {
|
|
107 ajFmtPrintF(outfile, "gene,%S\n", access);
|
|
108 }
|
|
109
|
|
110 base = ajStrNewC("rest.g-language.org");
|
|
111
|
|
112 ajStrExchangeCC(&argument, " ", "/");
|
|
113 ajStrExchangeCC(&argument, ",", "/");
|
|
114 ajStrExchangeCC(&argument, "\t", "/");
|
|
115 ajStrExchangeCC(&argument, "\r", "/");
|
|
116 ajStrExchangeCC(&argument, "\n", "/");
|
|
117
|
|
118 if(ajStrMatchC(gene, "*"))
|
|
119 {
|
|
120 ajStrInsertK(&gene, 0, '.');
|
|
121 }
|
|
122
|
|
123 if(ajStrPrefixC(gene, "@") || ajStrPrefixC(gene, "list::"))
|
|
124 {
|
|
125 ajStrExchangeCC(&gene, "@", "");
|
|
126 ajStrExchangeCC(&gene, "list::", "");
|
|
127 ajStrAssignS(&tmpname, gene);
|
|
128
|
|
129 tmpfile = ajFileNewInNameS(tmpname);
|
|
130
|
|
131 if(!tmpfile)
|
|
132 {
|
|
133 ajDie("List file (%S) open error\n", tmpname);
|
|
134 }
|
|
135
|
|
136 gene = ajStrNew();
|
|
137
|
|
138 while(ajReadline(tmpfile, &line))
|
|
139 {
|
|
140 ajStrAppendS(&gene, line);
|
|
141 }
|
|
142
|
|
143 ajFileClose(&tmpfile);
|
|
144 ajStrDel(&tmpname);
|
|
145 ajStrDel(&line);
|
|
146 }
|
|
147
|
|
148 tmpname = ajStrNew();
|
|
149 gAssignUniqueName(&tmpname);
|
|
150
|
|
151 while(ajSeqallNext(seqall, &seq))
|
|
152 {
|
|
153 inseq = ajStrNew();
|
|
154
|
|
155 if(!accid)
|
|
156 {
|
|
157 if(gFormatGenbank(seq, &inseq))
|
|
158 {
|
|
159 tmpfile = ajFileNewOutNameS(tmpname);
|
|
160
|
|
161 if(!tmpfile)
|
|
162 {
|
|
163 ajDie("Output file (%S) open error\n", tmpname);
|
|
164 }
|
|
165
|
|
166 ajFmtPrintF(tmpfile, "%S", inseq);
|
|
167
|
|
168 ajFileClose(&tmpfile);
|
|
169
|
|
170 ajFmtPrintS(&url, "http://%S/upload/upl.pl", base);
|
|
171
|
|
172 gFilePostSS(url, tmpname, &restid);
|
|
173
|
|
174 ajStrDel(&url);
|
|
175
|
|
176 ajSysFileUnlinkS(tmpname);
|
|
177 }
|
|
178 else
|
|
179 {
|
|
180 ajWarn("Sequence does not have features\n"
|
|
181 "Proceeding with sequence accession ID\n");
|
|
182 accid = ajTrue;
|
|
183 }
|
|
184 }
|
|
185
|
|
186
|
|
187 ajStrAssignS(&seqid, ajSeqGetAccS(seq));
|
|
188
|
|
189 if(ajStrGetLen(seqid) == 0)
|
|
190 {
|
|
191 ajStrAssignS(&seqid, ajSeqGetNameS(seq));
|
|
192 }
|
|
193
|
|
194 if(ajStrGetLen(seqid) == 0)
|
|
195 {
|
|
196 ajWarn("No valid header information\n");
|
|
197 }
|
|
198
|
|
199 if(accid)
|
|
200 {
|
|
201 ajStrAssignS(&restid, seqid);
|
|
202 if(ajStrGetLen(seqid) == 0)
|
|
203 {
|
|
204 ajDie("Cannot proceed without header with -accid\n");
|
|
205 }
|
|
206
|
|
207 if(!gValID(seqid))
|
|
208 {
|
|
209 ajDie("Invalid accession ID:%S, exiting\n", seqid);
|
|
210 }
|
|
211 }
|
|
212
|
|
213 url = ajStrNew();
|
|
214
|
|
215 if(isgbk)
|
|
216 {
|
|
217 ajFmtPrintS(&url, "http://%S/%S/%S", base, restid, access);
|
|
218 }
|
|
219 else
|
|
220 {
|
|
221 ajFmtPrintS(&url, "http://%S/%S/*/%S/%S", base, restid, access, argument);
|
|
222 }
|
|
223
|
|
224 if(!gFilebuffURLS(url, &buff))
|
|
225 {
|
|
226 ajDie("GET error from %S\n", url);
|
|
227 }
|
|
228
|
|
229 while(ajBuffreadLine(buff, &line))
|
|
230 {
|
|
231 if(isgbk){
|
|
232 ajFmtPrintF(outfile, "%S", line);
|
|
233 continue;
|
|
234 }
|
|
235
|
|
236 ajStrRemoveLastNewline(&line);
|
|
237
|
|
238 regex = ajRegCompC("^>");
|
|
239
|
|
240 if(ajRegExec(regex, line))
|
|
241 {
|
|
242 head = ajStrNew();
|
|
243
|
|
244 ajStrAssignS(&head, line);
|
|
245 ajStrTrimStartC(&head, ">");
|
|
246
|
|
247 valid = ajFalse;
|
|
248
|
|
249 token = ajStrTokenNewC(ajStrNewS(gene), " ,\t\r\n");
|
|
250
|
|
251 while(ajStrTokenNextParse(token, ®exstr))
|
|
252 {
|
|
253 if(ajStrGetLen(regexstr))
|
|
254 {
|
|
255 regex = ajRegComp(regexstr);
|
|
256
|
|
257 if(ajRegExec(regex, line))
|
|
258 {
|
|
259 valid = ajTrue;
|
|
260 if(ajStrIsAlnum(regexstr))
|
|
261 {
|
|
262 ajStrExchangeSC(&gene, regexstr, "");
|
|
263 }
|
|
264 }
|
|
265
|
|
266 ajRegFree(®ex);
|
|
267 }
|
|
268 }
|
|
269 }
|
|
270 else
|
|
271 {
|
|
272 if(valid)
|
|
273 {
|
|
274 if(isseq)
|
|
275 {
|
|
276 ajStrFmtWrap(&line, 60);
|
|
277 ajFmtPrintF(outfile, ">%S\n%S\n", head, line);
|
|
278 }
|
|
279 else
|
|
280 {
|
|
281 ajFmtPrintF(outfile, "%S,%S\n", head, line);
|
|
282 }
|
|
283
|
|
284 valid = ajFalse;
|
|
285 }
|
|
286 }
|
|
287 }
|
|
288
|
|
289 ajFileClose(&outfile);
|
|
290
|
|
291 ajStrDel(&restid);
|
|
292 ajStrDel(&seqid);
|
|
293 ajStrDel(&inseq);
|
|
294 }
|
|
295
|
|
296 ajSeqallDel(&seqall);
|
|
297 ajSeqDel(&seq);
|
|
298 ajStrDel(&access);
|
|
299 ajStrDel(&gene);
|
|
300
|
|
301 embExit();
|
|
302 }
|