0
|
1 """
|
|
2 Copyright (C) 2013, Pieter Lukasse, Plant Research International, Wageningen
|
|
3
|
|
4 Licensed under the Apache License, Version 2.0 (the "License");
|
|
5 you may not use this software except in compliance with the License.
|
|
6 You may obtain a copy of the License at
|
|
7
|
|
8 http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
|
10 Unless required by applicable law or agreed to in writing, software
|
|
11 distributed under the License is distributed on an "AS IS" BASIS,
|
|
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13 See the License for the specific language governing permissions and
|
|
14 limitations under the License.
|
|
15
|
|
16 """
|
|
17
|
|
18 import sys
|
|
19 import pdfread
|
|
20 from subprocess import call
|
|
21
|
|
22
|
|
def convert_pdftotext(filename, output_file):
    '''
    Converts a PDF file to text by running the external "pdftotext" command.
    @param filename: PDF file to parse
    @param output_file: path handed to pdftotext as the output text file
    @raise Exception: if pdftotext cannot be started or exits with a
                      non-zero status
    '''
    try:
        # call() returns the exit status of the child process; it only raises
        # when the process cannot be launched at all (e.g. the pdftotext
        # binary is not installed).
        return_code = call(["pdftotext", filename, output_file])
    except Exception as err:
        # "except Exception" (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit propagate untouched.
        raise Exception("Error while trying to convert PDF to text: %s" % err)

    # A non-zero exit status means the conversion itself failed; report it
    # instead of silently continuing without a usable output file.
    if return_code != 0:
        raise Exception("Error while trying to convert PDF to text: "
                        "pdftotext exited with status %s" % return_code)
|
|
34
|
|
35
|
|
36
|
|
37
|
|
if __name__ == '__main__':
    # Expected command line: <pdf_file> <arg2> <arg3>. The last two values
    # are forwarded unchanged to pdfread.convert_pdftotext2tabular.
    if len(sys.argv) < 4:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit("Usage: %s <pdf_file> <arg2> <arg3>\n"
                 "(<arg2> and <arg3> are passed on to "
                 "pdfread.convert_pdftotext2tabular)" % sys.argv[0])

    pdf_file = sys.argv[1]
    # The intermediate text file is the PDF path with ".txt" appended.
    pdf_as_text = pdf_file + ".txt"
    convert_pdftotext(pdf_file, pdf_as_text)
    pdfread.convert_pdftotext2tabular(pdf_as_text, sys.argv[2], sys.argv[3], False)
|