train.py
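
"""Fine-tune a COCO-pretrained Mask R-CNN (R50-FPN) from the Detectron2 model zoo
on a COCO-format instance-segmentation dataset of 2D-material flakes (WS2 or NbSe2),
with TensorBoard logging and periodic COCO evaluation on the test split."""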
import logging
from pathlib import Path

import torch
from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.data.datasets import register_coco_instances
from detectron2.engine import DefaultTrainer
from detectron2.evaluation import COCOEvaluator
from detectron2.utils.events import TensorboardXWriter
from detectron2.utils.logger import setup_logger
# Constants
MATERIAL = "NbSe2"  # "WS2" or "NbSe2" (other materials may be added later)
TODAY_DATASET = "2025_01_13_newImages"
BASE_DIR = Path(__file__).resolve().parent / MATERIAL
TODAY_DATASET_FOLDER = BASE_DIR / "data" / TODAY_DATASET
TRAIN_JSON = "train_annotations.json"
TEST_JSON = "val_annotations.json"  # the validation split doubles as the test set
MODEL_CONFIG = "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"
MODEL_WEIGHTS = model_zoo.get_checkpoint_url(MODEL_CONFIG)  # COCO-pretrained weights
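
# Expected on-disk layout (inferred from the paths above, not spelled out in the
# original; image filenames inside the JSONs are resolved relative to
# TODAY_DATASET_FOLDER):
#   NbSe2/data/2025_01_13_newImages/
#       train_annotations.json   # COCO-format training annotations
#       val_annotations.json     # COCO-format validation/test annotations
#       <image files referenced by both JSONs>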
# Setup directories
def setup_directories(base_path, subdirs):
    """Create each subdirectory under base_path (if missing) and return a name -> Path dict."""
    dirs = {}
    for subdir in subdirs:
        dir_path = base_path / subdir
        dir_path.mkdir(parents=True, exist_ok=True)
        dirs[subdir] = dir_path
    return dirs
dirs = setup_directories(BASE_DIR, ["logs", "checkpoints", "configs", "data"])

if not (TODAY_DATASET_FOLDER / TRAIN_JSON).exists():
    raise FileNotFoundError(f"Training dataset JSON not found: {TODAY_DATASET_FOLDER / TRAIN_JSON}")
if not (TODAY_DATASET_FOLDER / TEST_JSON).exists():
    raise FileNotFoundError(f"Test dataset JSON not found: {TODAY_DATASET_FOLDER / TEST_JSON}")
# Dataset registration
def register_datasets():
    """Register the COCO-format train and test splits with Detectron2's DatasetCatalog."""
    register_coco_instances("dataset_train", {}, TODAY_DATASET_FOLDER / TRAIN_JSON, TODAY_DATASET_FOLDER)
    register_coco_instances("dataset_test", {}, TODAY_DATASET_FOLDER / TEST_JSON, TODAY_DATASET_FOLDER)
# Model configuration
def setup_cfg():
    cfg = get_cfg()
    cfg.OUTPUT_DIR = str(dirs["configs"])  # checkpoints and eval results are written here
    cfg.LOG_DIR = str(dirs["logs"])  # custom key, consumed by TrainerWithTensorboard below
    cfg.merge_from_file(model_zoo.get_config_file(MODEL_CONFIG))
    cfg.DATASETS.TRAIN = ("dataset_train",)
    cfg.DATASETS.TEST = ("dataset_test",)
    cfg.TEST.EVAL_PERIOD = 20  # run COCO evaluation every 20 iterations
    cfg.DATALOADER.NUM_WORKERS = 1  # was 2
    cfg.MODEL.WEIGHTS = MODEL_WEIGHTS
    cfg.SOLVER.IMS_PER_BATCH = 1  # images per batch, i.e. the effective batch size
    cfg.SOLVER.BASE_LR = 0.00001
    cfg.SOLVER.MAX_ITER = 1000  # 1000 iterations is enough for a dataset of this size
    cfg.SOLVER.STEPS = []  # empty: do not decay the learning rate
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 256  # RoI proposals sampled per image (default 512)
    # NUM_CLASSES counts foreground classes only; do NOT add one for background.
    if MATERIAL == "WS2":
        cfg.MODEL.ROI_HEADS.NUM_CLASSES = 3  # WS2 classes: ML, BL, FL
    elif MATERIAL == "NbSe2":
        # Originally 6 classes (1L, 2L, 3L, 4L, 5L, >=5L); now 4: 3L, 4L, 5L, >5L.
        cfg.MODEL.ROI_HEADS.NUM_CLASSES = 4
    else:
        raise ValueError(f"Unsupported material {MATERIAL!r}. Choose one of: WS2, NbSe2.")
    cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    cfg.SOLVER.CLIP_GRADIENTS.ENABLED = True
    cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "value"
    cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 1.0  # clip gradient values to prevent explosion
    return cfg
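
# Optional sketch (not in the original): after building cfg, persist the fully
# resolved config for reproducibility. cfg.dump() is the standard yacs YAML
# serialization; the filename "train_config.yaml" is an assumption.
#     with open(Path(cfg.OUTPUT_DIR) / "train_config.yaml", "w") as f:
#         f.write(cfg.dump())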
# Logging
setup_logger(output=str(dirs["logs"]))  # Detectron2's logger; also writes log.txt to the logs dir
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Custom Trainer with TensorBoard support
class TrainerWithTensorboard(DefaultTrainer):
    def build_writers(self):
        # Keep the default writers (metric printer, JSON file, and a TensorBoard
        # writer pointing at OUTPUT_DIR) and add a second TensorBoard writer
        # that logs to the dedicated logs directory.
        writers = super().build_writers()
        writers.append(TensorboardXWriter(self.cfg.LOG_DIR))
        return writers

    @staticmethod
    def build_evaluator(cfg, dataset_name, output_folder=None):
        """Build the COCO evaluator used for periodic evaluation during training."""
        if output_folder is None:
            output_folder = str(Path(cfg.OUTPUT_DIR) / "eval")
        return COCOEvaluator(dataset_name, cfg, False, output_dir=output_folder)
# Main script
if __name__ == "__main__":
    # torch.cuda.set_per_process_memory_fraction(0.5, 0)  # optionally cap GPU memory at 50%
    register_datasets()
    cfg = setup_cfg()
    trainer = TrainerWithTensorboard(cfg)
    # Load the pretrained MODEL.WEIGHTS; resume=False starts a fresh run rather
    # than resuming from the last checkpoint in OUTPUT_DIR.
    trainer.resume_or_load(resume=False)
    try:
        trainer.train()
    except Exception as e:
        logger.error(f"Training failed: {e}")
        raise
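
    # Optional post-training evaluation sketch (standard Detectron2 APIs; not in
    # the original script). Runs the trained model over the registered test split:
    # from detectron2.data import build_detection_test_loader
    # from detectron2.evaluation import inference_on_dataset
    # evaluator = TrainerWithTensorboard.build_evaluator(cfg, "dataset_test")
    # test_loader = build_detection_test_loader(cfg, "dataset_test")
    # print(inference_on_dataset(trainer.model, test_loader, evaluator))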