annotate preprocessing.py @ 4:d77c67cfe2ca draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit c6c9d43a4ecdc88ebdeaf3451453a550f159c506
author bgruening
date Mon, 21 Jul 2025 15:51:37 +0000
parents e708a1191202
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
1 import argparse
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
2 import os
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
3
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
4 from sklearn.model_selection import train_test_split
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
5
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
6
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
def get_basename(f):
    """Return the file name of *f* without its directory and last extension.

    Only the final extension is stripped, so ``"a/b/mask.v2.txt"`` yields
    ``"mask.v2"``.
    """
    return os.path.splitext(os.path.basename(f))[0]
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
9
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
10
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
def pair_files(images_dir, labels_dir):
    """Pair image and label files that share the same basename.

    Args:
        images_dir: Directory containing the image files.
        labels_dir: Directory containing the label files.

    Returns:
        A list of ``(image_name, label_name)`` tuples (bare file names, not
        full paths), sorted by basename. Files present in only one of the two
        directories are left out.
    """

    def index_by_basename(directory):
        # Only regular files can be copied later, so sub-directories are
        # skipped. Sorting the listing makes the result reproducible when two
        # files share a basename (the last one in sort order wins), because
        # os.listdir() returns entries in arbitrary order.
        return {
            get_basename(name): name
            for name in sorted(os.listdir(directory))
            if os.path.isfile(os.path.join(directory, name))
        }

    image_dict = index_by_basename(images_dir)
    label_dict = index_by_basename(labels_dir)

    keys = sorted(image_dict.keys() & label_dict.keys())
    return [(image_dict[k], label_dict[k]) for k in keys]
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
22
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
23
3
e708a1191202 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 743c8acf1ea4e4b1e718743d3772b7e592646611
bgruening
parents: 0
diff changeset
def copy_file(src, dst):
    """Copy the contents of *src* to *dst* in 8 KiB chunks.

    Only the file content is copied; permissions and timestamps are not.
    If *src* and *dst* refer to the same file the call is a no-op.

    Raises:
        OSError: If *src* cannot be read or *dst* cannot be written.
    """
    # Opening dst with 'wb' truncates it immediately; if it is the same file
    # as src, the data would be destroyed before it is read.
    if os.path.exists(dst) and os.path.samefile(src, dst):
        return

    with open(src, "rb") as fsrc, open(dst, "wb") as fdst:
        # read() returns b"" at end of file, which stops the iteration.
        for chunk in iter(lambda: fsrc.read(8192), b""):
            fdst.write(chunk)
e708a1191202 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 743c8acf1ea4e4b1e718743d3772b7e592646611
bgruening
parents: 0
diff changeset
31
e708a1191202 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 743c8acf1ea4e4b1e718743d3772b7e592646611
bgruening
parents: 0
diff changeset
32
0
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
def copy_pairs(pairs, image_src, label_src, image_dst, label_dst):
    """Copy every image/label pair into the destination directories.

    Args:
        pairs: Iterable of ``(image_name, label_name)`` file names.
        image_src: Directory the image files are read from.
        label_src: Directory the label files are read from.
        image_dst: Directory the images are written to (created if missing).
        label_dst: Directory the labels are written to (created if missing).
    """
    # Both directories are created even when `pairs` is empty.
    os.makedirs(image_dst, exist_ok=True)
    os.makedirs(label_dst, exist_ok=True)
    for img, lbl in pairs:
        copy_file(os.path.join(image_src, img), os.path.join(image_dst, img))
        copy_file(os.path.join(label_src, lbl), os.path.join(label_dst, lbl))
0
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
39
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
40
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
def write_yolo_yaml(output_dir):
    """Write the dataset description file ``yolo.yml`` into *output_dir*.

    The file records *output_dir* as the dataset ``path``, the split
    sub-directories ``train``, ``valid`` and ``test``, and a single class
    named ``dataset``. *output_dir* is created if it does not exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)
    yolo_yaml_path = os.path.join(output_dir, "yolo.yml")
    content = (
        f"path: {output_dir}\n"
        "train: train\n"
        "val: valid\n"
        "test: test\n"
        "\n"
        "names: ['dataset']\n"
    )
    # Explicit UTF-8 so a non-ASCII output path does not depend on the locale.
    with open(yolo_yaml_path, "w", encoding="utf-8") as f:
        f.write(content)
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
51
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
52
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
def main():
    """Split paired image/label files into train, valid and test sets.

    Command-line options:
        -i/--images         directory with the image files (required)
        -y/--labels         directory with the label files (required)
        -o/--output         directory the split dataset is written to (required)
        -p/--train_percent  integer percentage of pairs used for training
                            (default 70); the remainder is divided evenly
                            between validation and test.

    The split uses a fixed random seed, so the same input produces the same
    split on every run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--images", required=True)
    parser.add_argument("-y", "--labels", required=True)
    parser.add_argument("-o", "--output", required=True)
    parser.add_argument("-p", "--train_percent", type=int, default=70)
    args = parser.parse_args()

    # 0 and 100 would give a test_size of 1.0 or 0.0, which train_test_split
    # rejects with a ValueError; report the problem as a usage error instead.
    if not 0 < args.train_percent < 100:
        parser.error("--train_percent must be between 1 and 99")

    all_pairs = pair_files(args.images, args.labels)
    # Each of the three splits needs at least one pair.
    if len(all_pairs) < 3:
        raise SystemExit(
            f"Need at least 3 matching image/label pairs, found {len(all_pairs)}."
        )

    train_size = args.train_percent / 100.0
    val_test_size = 1.0 - train_size

    train_pairs, val_test_pairs = train_test_split(
        all_pairs, test_size=val_test_size, random_state=42
    )
    # The held-out part is halved below, so it must contain two pairs or more.
    if len(val_test_pairs) < 2:
        raise SystemExit(
            "Too few image/label pairs left for validation and test; "
            "lower --train_percent or provide more data."
        )
    val_pairs, test_pairs = train_test_split(
        val_test_pairs, test_size=0.5, random_state=42
    )

    splits = (("train", train_pairs), ("valid", val_pairs), ("test", test_pairs))
    for split_name, split_pairs in splits:
        copy_pairs(
            split_pairs,
            args.images,
            args.labels,
            os.path.join(args.output, split_name, "images"),
            os.path.join(args.output, split_name, "labels"),
        )

    write_yolo_yaml(args.output)
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
73
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
74
af3eed1f1abe planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
bgruening
parents:
diff changeset
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()