Mercurial > repos > bgruening > json2yolosegment
annotate preprocessing.py @ 5:ce7a96be8cb6 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit c6c9d43a4ecdc88ebdeaf3451453a550f159c506
| author | bgruening | 
|---|---|
| date | Mon, 21 Jul 2025 15:52:12 +0000 | 
| parents | 7db48c618bbe | 
| children | 
| rev | line source | 
|---|---|
| 0 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
changeset | 1 import argparse | 
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
changeset | 2 import os | 
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
changeset | 3 | 
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
changeset | 4 from sklearn.model_selection import train_test_split | 
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
changeset | 5 | 
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
changeset | 6 | 
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
def get_basename(f):
    """Return the file name of ``f`` without directories or its last extension.

    Only the final extension is removed, so ``"a.tar.gz"`` becomes ``"a.tar"``.
    """
    filename = os.path.basename(f)
    stem, _extension = os.path.splitext(filename)
    return stem
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
changeset | 9 | 
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
changeset | 10 | 
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
def pair_files(images_dir, labels_dir):
    """Pair image files with label files that share the same base name.

    Both directories are listed non-recursively. Entries are matched on their
    name with the last extension removed (see ``get_basename``); names present
    in only one directory are dropped.

    Args:
        images_dir: Directory containing the image files.
        labels_dir: Directory containing the label files.

    Returns:
        A list of ``(image_filename, label_filename)`` tuples, sorted by the
        shared base name. The file names are relative to their directories.

    Raises:
        OSError: If either directory cannot be listed.
    """

    def _index_by_stem(directory):
        # os.listdir() returns entries in arbitrary order. Sorting first makes
        # the "last entry wins" rule deterministic when several files share a
        # stem (e.g. "a.jpg" and "a.png").
        index = {}
        for name in sorted(os.listdir(directory)):
            # A subdirectory can never be copied as an image/label file.
            if os.path.isdir(os.path.join(directory, name)):
                continue
            index[get_basename(name)] = name
        return index

    image_dict = _index_by_stem(images_dir)
    label_dict = _index_by_stem(labels_dir)

    keys = sorted(image_dict.keys() & label_dict.keys())
    return [(image_dict[k], label_dict[k]) for k in keys]
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
changeset | 22 | 
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
changeset | 23 | 
| 4 
7db48c618bbe
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 743c8acf1ea4e4b1e718743d3772b7e592646611
 bgruening parents: 
0diff
def copy_file(src, dst):
    """Copy the contents of ``src`` to ``dst`` in 8192-byte chunks.

    Both files are opened in binary mode; ``dst`` is created or overwritten.
    Only the file contents are transferred.
    """
    block_size = 8192
    with open(src, 'rb') as reader, open(dst, 'wb') as writer:
        # iter() with a sentinel stops on the first empty read, i.e. at EOF.
        for block in iter(lambda: reader.read(block_size), b''):
            writer.write(block)
| 
7db48c618bbe
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 743c8acf1ea4e4b1e718743d3772b7e592646611
 bgruening parents: 
0diff
changeset | 31 | 
| 
7db48c618bbe
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 743c8acf1ea4e4b1e718743d3772b7e592646611
 bgruening parents: 
0diff
changeset | 32 | 
| 0 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
def copy_pairs(pairs, image_src, label_src, image_dst, label_dst):
    """Copy every (image, label) pair into the destination directories.

    Args:
        pairs: Iterable of ``(image_filename, label_filename)`` entries.
        image_src: Directory the image files are read from.
        label_src: Directory the label files are read from.
        image_dst: Directory the images are copied to (created if missing).
        label_dst: Directory the labels are copied to (created if missing).
    """
    for directory in (image_dst, label_dst):
        os.makedirs(directory, exist_ok=True)

    for image_name, label_name in pairs:
        # Image first, then its label; file names are kept unchanged.
        transfers = (
            (image_src, image_dst, image_name),
            (label_src, label_dst, label_name),
        )
        for src_dir, dst_dir, name in transfers:
            copy_file(os.path.join(src_dir, name), os.path.join(dst_dir, name))
| 0 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
changeset | 39 | 
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
changeset | 40 | 
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
def write_yolo_yaml(output_dir):
    """Write the ``yolo.yml`` dataset description into ``output_dir``.

    The file records ``output_dir`` as the dataset path, the ``train``,
    ``valid`` and ``test`` sub-directories, and a single class name
    ``'dataset'``.
    """
    config_lines = [
        f"path: {output_dir}\n",
        "train: train\n",
        "val: valid\n",
        "test: test\n",
        "\n",
        "names: ['dataset']\n",
    ]
    with open(os.path.join(output_dir, "yolo.yml"), 'w') as handle:
        handle.writelines(config_lines)
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
changeset | 51 | 
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
changeset | 52 | 
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
def main():
    """Split paired image/label files into train, valid and test sets.

    Command-line arguments:
        -i/--images: directory with the image files (required).
        -y/--labels: directory with the label files (required).
        -o/--output: directory that receives ``train/``, ``valid/`` and
            ``test/`` (each with ``images/`` and ``labels/``) plus
            ``yolo.yml`` (required).
        -p/--train_percent: integer percentage of pairs used for training
            (default 70). The remaining pairs are split in half between
            validation and test.

    Both splits use ``random_state=42`` so repeated runs give the same result.
    Invalid percentages or datasets too small to fill all three splits are
    reported through ``parser.error`` (exit status 2).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--images", required=True)
    parser.add_argument("-y", "--labels", required=True)
    parser.add_argument("-o", "--output", required=True)
    parser.add_argument("-p", "--train_percent", type=int, default=70)
    args = parser.parse_args()

    if not 0 < args.train_percent < 100:
        parser.error("--train_percent must be an integer between 1 and 99")

    all_pairs = pair_files(args.images, args.labels)
    n_pairs = len(all_pairs)

    # scikit-learn turns a float test_size into ceil(test_size * n_samples).
    # Computing the fraction as 1.0 - p / 100.0 carries float error
    # (1.0 - 0.7 == 0.30000000000000004), which can push that ceil one sample
    # too high. Do the same ceil in exact integer arithmetic and pass the
    # absolute count instead.
    holdout_count = (n_pairs * (100 - args.train_percent) + 99) // 100

    # The hold-out part is split again into valid/test, so it needs at least
    # two pairs, and at least one pair must remain for training.
    if holdout_count < 2 or holdout_count >= n_pairs:
        parser.error(
            f"found {n_pairs} image/label pair(s); not enough to build "
            f"train, valid and test sets with --train_percent={args.train_percent}"
        )

    train_pairs, val_test_pairs = train_test_split(all_pairs, test_size=holdout_count, random_state=42)
    val_pairs, test_pairs = train_test_split(val_test_pairs, test_size=0.5, random_state=42)

    splits = (("train", train_pairs), ("valid", val_pairs), ("test", test_pairs))
    for split_name, split_pairs in splits:
        copy_pairs(
            split_pairs,
            args.images,
            args.labels,
            os.path.join(args.output, f"{split_name}/images"),
            os.path.join(args.output, f"{split_name}/labels"),
        )

    write_yolo_yaml(args.output)
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
changeset | 73 | 
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
changeset | 74 | 
| 
fa068d13e781
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools commit 67e0e1d123bcfffb10bab8cc04ae67259caec557
 bgruening parents: diff
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
