0
|
1 #####################################################################################
|
|
2 # curate_csv_with_worms.yaml
|
|
3 #####################################################################################
|
|
4 #
|
|
5 # Clean data provided via standard input and print results to standard output:
|
|
6 # ka curate_csv_with_worms.yaml < ../data/five_records.csv
|
|
7 #
|
|
8 # Clean data in a named input file and print results to standard output:
|
|
9 # ka curate_csv_with_worms.yaml -p input=../data/five_records.csv
|
|
10 #
|
|
11 # Clean data in a named input file and save results to named output file:
|
|
12 # ka curate_csv_with_worms.yaml -p input=../data/five_records.csv -p output=output.csv
|
|
13 #
|
|
14 #####################################################################################
|
|
15
|
|
16 imports:
|
|
17
|
|
18 - classpath:/org/kurator/akka/types.yaml
|
|
19
|
|
20 components:
|
|
21
|
|
22 - id: ReadInput
|
|
23 type: PythonActor
|
|
24 properties:
|
|
25 code: |
|
|
26 import csv
|
|
27 import sys
|
|
def on_start(options):
    """Yield each row of the input CSV as a dict keyed by the header row.

    options -- mapping of actor settings. The optional 'inputFile' entry is
               the path of the CSV file to read; when it is missing or None
               the rows are read from standard input instead.

    Rows are produced lazily by csv.DictReader, one per iteration.
    """
    filePath = options.get('inputFile')

    if filePath is None:
        # Standard input is owned by the process, so it is never closed here.
        for record in csv.DictReader(sys.stdin):
            yield record
        return

    f = open(filePath, 'r')
    try:
        for record in csv.DictReader(f):
            yield record
    finally:
        # Runs when the rows are exhausted, when an error occurs, or when the
        # generator is closed early, so the file handle is always released.
        f.close()
|
|
33
|
|
34 - id: CleanRecords
|
|
35 type: PythonClassActor
|
|
36 properties:
|
|
37 pythonClass: kurator_worms.record_curator.RecordCurator
|
|
38 onData: clean_record
|
|
39 parameters:
|
|
40 taxon_name_field : 'TaxonName'
|
|
41 author_field : 'Author'
|
|
42 original_taxon_name_field : 'OriginalTaxonName'
|
|
43 original_author_field : 'OriginalAuthor'
|
|
44 match_type_field : 'WoRMSMatchType'
|
|
45 lsid_field : 'LSID'
|
|
46 fuzzy_match_enabled : 'True'
|
|
47 listensTo:
|
|
48 - !ref ReadInput
|
|
49
|
|
50 - id: WriteOutput
|
|
51 type: PythonClassActor
|
|
52 properties:
|
|
53 pythonClass: CsvWriter
|
|
54 code: |
|
|
55 import csv
|
|
56 import os
|
|
57 import sys
|
|
class CsvWriter(object):
    """Write incoming records as CSV rows to a file or to standard output."""

    def on_start(self, options):
        """Open the output destination and write the CSV header row.

        options -- mapping of actor settings. 'fieldnames' (required) lists
                   the column names in output order. 'outputFile' (optional)
                   is the path to write to; when it is missing or None the
                   rows are written to standard output.
        """
        filePath = options.get('outputFile')
        self.f = open(filePath, 'w') if filePath is not None else sys.stdout
        self.dw = csv.DictWriter(self.f,
                                 fieldnames=options['fieldnames'],
                                 quotechar="'",
                                 lineterminator=os.linesep)
        self.dw.writeheader()

    def on_data(self, record):
        """Write one record (a dict keyed by field name) as a CSV row."""
        self.dw.writerow(record)
        # Flush the stream that was actually written to. Flushing sys.stdout
        # unconditionally left rows sitting in the buffer when the output
        # was a named file.
        self.f.flush()
|
|
70 parameters:
|
|
71 fieldnames: [ID,TaxonName,Author,OriginalTaxonName,OriginalAuthor,WoRMSMatchType,LSID]
|
|
72 listensTo:
|
|
73 - !ref CleanRecords
|
|
74
|
|
75 - id: ValidateNamesWithWoRMSWorkflow
|
|
76 type: Workflow
|
|
77 properties:
|
|
78 actors:
|
|
79 - !ref ReadInput
|
|
80 - !ref CleanRecords
|
|
81 - !ref WriteOutput
|
|
82 parameters:
|
|
83 input:
|
|
84 actor: !ref ReadInput
|
|
85 parameter: inputFile
|
|
86 output:
|
|
87 actor: !ref WriteOutput
|
|
88 parameter: outputFile
|