levy-tech-spark
diff --git a/‎config.py‎
Lines changed: 3 additions & 1 deletion b/‎config.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎configs/test_config.yaml‎
Lines changed: 20 additions & 0 deletions b/‎configs/test_config.yaml‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎config.yaml‎ renamed to ‎configs/train_config.yaml‎
Lines changed: 2 additions & 2 deletions b/‎config.yaml‎ renamed to ‎configs/train_config.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎groundingdino/util/inference.py‎
Lines changed: 18 additions & 6 deletions b/‎groundingdino/util/inference.py‎
Lines changed: 18 additions & 6 deletions
diff --git a/‎groundingdino/util/lora.py‎
Lines changed: 0 additions & 1 deletion b/‎groundingdino/util/lora.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎test.py‎
Lines changed: 14 additions & 11 deletions b/‎test.py‎
Lines changed: 14 additions & 11 deletions
diff --git a/‎train.py‎
Lines changed: 3 additions & 3 deletions b/‎train.py‎
Lines changed: 3 additions & 3 deletions
@@ -27,12 +27,14 @@ def from_dict(cls, data: Dict[str, Any]) -> 'DataConfig':
 class ModelConfig:
+    # Paths needed to build and load the model.
+    #   config_path:  model architecture config file.
+    #   weights_path: main model checkpoint.
+    #   lora_weigths: optional LoRA checkpoint path; None when not configured.
+    #                 (The attribute keeps its existing spelling because other
+    #                 modules read it under this name.)
     config_path: str
     weights_path: str
+    lora_weigths: str = None
 
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> 'ModelConfig':
+        """Build a ModelConfig from a mapping such as the ``model:`` config section.
+
+        ``config_path`` and ``weights_path`` are required (KeyError if absent).
+        The LoRA path is optional and is read from ``lora_weights``; the
+        misspelled key ``lora_weigths`` is accepted as a fallback. A missing or
+        null value stays ``None`` instead of becoming the string ``"None"``.
+        """
+        lora_path = data.get('lora_weights', data.get('lora_weigths'))
         return cls(
             config_path=str(data['config_path']),
-            weights_path=str(data['weights_path'])
+            weights_path=str(data['weights_path']),
+            lora_weigths=None if lora_path is None else str(lora_path),
         )
 
 @dataclass
 
@@ -0,0 +1,20 @@
+data:
+  batch_size: 4
+  num_workers: 8
+  train_ann: multimodal-data/fashion_dataset_subset/train_annotations.csv
+  train_dir: multimodal-data/fashion_dataset_subset/images/train
+  val_ann: multimodal-data/fashion_dataset_subset/val_annotations.csv
+  val_dir: multimodal-data/fashion_dataset_subset/images/val
+model:
+  config_path: groundingdino/config/GroundingDINO_SwinT_OGC.py
+  lora_weights: null  # LoRA checkpoint path; must be set when training.use_lora is true
+  weights_path: weights/groundingdino_swint_ogc.pth
+training:
+  learning_rate: 0.0001
+  num_epochs: 200
+  save_dir: weights
+  save_frequency: 5
+  use_lora: true
+  use_lora_layers: true
+  visualization_frequency: 5
+  warmup_epochs: 5
@@ -12,9 +12,9 @@ model:
 
 training:
   num_epochs: 200
-  learning_rate: 1e-3
+  learning_rate: 1e-4
   save_dir: "weights"
-  save_frequency: 100
+  save_frequency: 5
   warmup_epochs: 5
   use_lora: true
   use_lora_layers: true # Applies LoRA only to the bbox prediction layer and a few transformer decoder layers; the number of trainable parameters is then < 1% of the total parameters
 
@@ -18,6 +18,7 @@
 from groundingdino.util.class_loss import FocalLoss
 import os
 from groundingdino.util.box_ops import box_cxcywh_to_xyxy
+from config import ModelConfig
 
 # ----------------------------------------------------------------------------------------------------------------------
 # OLD API
@@ -31,16 +32,27 @@ def preprocess_caption(caption: str) -> str:
     return result + "."
 
 
-def load_model(model_config_path: str, model_checkpoint_path: str, device: str = "cuda",strict: bool =True):
-    args = SLConfig.fromfile(model_config_path)
-    args.device = device
-    model = build_model(args)
-    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
+def load_weights(model:torch.nn.Module,checkpoint:dict):
+    """Load the weights held by ``checkpoint`` into ``model`` in place.
+
+    Args:
+        model: Module that receives the weights.
+        checkpoint: Either a dict that stores the state dict under the
+            ``"model"`` key, or the state dict itself.
+
+    Both branches pass the state dict through ``clean_state_dict`` and load
+    it with ``strict=False``, so keys that do not match the model are not
+    reported as an error.
+    """
     if "model" in checkpoint.keys():
         model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
     else:
         # The state dict is the checkpoint
-        model.load_state_dict(clean_state_dict(checkpoint), strict=True)
+        model.load_state_dict(clean_state_dict(checkpoint), strict=False)
+
+
+def load_model(model_config:ModelConfig, use_lora:bool= False, device: str = "cuda",strict: bool =True):
+    """Build the model, load its weights and return it in eval mode.
+
+    Args:
+        model_config: Supplies ``config_path``, ``weights_path`` and, when
+            ``use_lora`` is set, ``lora_weigths``.
+        use_lora: When True, a second checkpoint is loaded on top of the
+            main one.
+        device: Stored on the parsed config as ``args.device`` before the
+            model is built.
+        strict: Currently unused; every load goes through ``load_weights``,
+            which always loads non-strictly.
+
+    Returns:
+        The built model after ``model.eval()`` has been called.
+
+    Raises:
+        ValueError: If ``use_lora`` is True but no LoRA checkpoint path is
+            configured.
+    """
+    args = SLConfig.fromfile(model_config.config_path)
+    args.device = device
+    model = build_model(args)
+    # The main checkpoint is always loaded; without LoRA it is the only one needed.
+    checkpoint = torch.load(model_config.weights_path, map_location="cpu")
+    print("Loading main model Weights!!")
+    load_weights(model,checkpoint)
+    if use_lora:
+        lora_path = model_config.lora_weigths
+        # An unset path may arrive as None or as the literal string "None"
+        # (e.g. from str(None) or a YAML value written as `None`).
+        if lora_path in (None, "", "None"):
+            raise ValueError(
+                "use_lora is True but no LoRA weights path is configured "
+                "(model.lora_weights in the config file)"
+            )
+        # NOTE(review): load_weights is non-strict, so LoRA keys that the built
+        # model does not contain would be dropped silently - confirm the model
+        # already has its LoRA layers at this point.
+        print("Loading Lora Weights!!")
+        checkpoint = torch.load(lora_path, map_location="cpu")
+        load_weights(model,checkpoint)
+
     model.eval()
     return model
 
 
@@ -65,7 +65,6 @@ def add_lora_to_model(model, rank=8):
             "key",
             "value",
             "dense",    
-            "bbox_embed",
         ],
         lora_dropout=0.1,
         bias="none",
 
@@ -1,10 +1,10 @@
 from groundingdino.util.inference import load_model, load_image, predict, annotate
-import cv2
 import torch
 import torchvision.ops as ops
 import os
 from torchvision.ops import box_convert
 from groundingdino.util.inference import GroundingDINOVisualizer
+from config import ConfigurationManager, DataConfig, ModelConfig
 
 def apply_nms_per_phrase(image_source, boxes, logits, phrases, threshold=0.3):
     h, w, _ = image_source.shape
@@ -28,19 +28,17 @@ def apply_nms_per_phrase(image_source, boxes, logits, phrases, threshold=0.3):
     return torch.stack(nms_boxes_list), torch.stack(nms_logits_list), nms_phrases_list
 
 
-def process_image(
-        model_config="groundingdino/config/GroundingDINO_SwinT_OGC.py",
-        model_weights="weights/groundingdino_swint_ogc.pth",
-        image_path="multimodal-data/fashion_dataset_subset/images/val/val_000004.jpg",
-        text_prompt="shirt .bag .pants",
+def process_images(
+        model,
+        text_prompt,
+        data_config,
         box_threshold=0.35,
         text_threshold=0.25
 ):
-    model = load_model(model_config, model_weights)
     visualizer = GroundingDINOVisualizer(save_dir="visualizations")
 
-    for img in os.listdir('multimodal-data/fashion_dataset_subset/images/val'):
-        image_path=os.path.join('multimodal-data/fashion_dataset_subset/images/val',img)
+    for img in os.listdir(data_config.val_dir):
+        image_path=os.path.join(data_config.val_dir,img)
         image_source, image = load_image(image_path)
         visualizer.visualize_image(model,image,text_prompt,image_source,img)
 
@@ -60,5 +58,10 @@ def process_image(
 
 
 if __name__ == "__main__":
-    model_weights="weights/groundingdino_swint_ogc.pth"
-    process_image(model_weights=model_weights)
+    # Prediction settings are read from this config file. weights_path may hold the complete
+    # model weights; if use_lora is true, lora_weights must be set as well (see the example config).
+    config_path="configs/test_config.yaml"
+    text_prompt="shirt .bag .pants"  # plain str; a trailing comma here would make it a tuple
+    data_config, model_config, training_config = ConfigurationManager.load_config(config_path)
+    model = load_model(model_config,training_config.use_lora)
+    process_images(model,text_prompt,data_config)
@@ -222,14 +222,14 @@ def save_checkpoint(self, path, epoch, losses, use_lora=False):
         if use_lora:
             checkpoint = {
             'epoch': epoch,
-            'model_state_dict': get_lora_weights(self.model),
+            'model': get_lora_weights(self.model),
             'optimizer_state_dict': self.optimizer.state_dict(),
             'scheduler_state_dict': self.scheduler.state_dict() if self.scheduler else None,
             'losses': losses,}
         else:
             checkpoint = {
                 'epoch': epoch,
-                'model_state_dict': self.model.state_dict(),
+                'model': self.model.state_dict(),
                 'ema_state_dict': self.ema_model.state_dict() if self.use_ema else None,
                 'optimizer_state_dict': self.optimizer.state_dict(),
                 'scheduler_state_dict': self.scheduler.state_dict() if self.scheduler else None,
@@ -322,4 +322,4 @@ def train(config_path: str, save_dir: Optional[str] = None) -> None:
 
 
 if __name__ == "__main__":
-    train('config.yaml')
+    # Train with the default config, which now lives under configs/.
+    train('configs/train_config.yaml')