compute image inputs using config
eaidova committed Feb 14, 2025
1 parent cd3e6b6 commit 12025e3
Showing 4 changed files with 31 additions and 16 deletions.
optimum/exporters/openvino/__main__.py (2 changes: 1 addition & 1 deletion)
@@ -431,7 +431,7 @@ class StoreAttr(object):
  logger.info(f"Automatic task detection to {task}{possible_synonyms}.")

  preprocessors = load_preprocessors(
-     model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
+     model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code, model_type=model_type
  )

  submodel_paths = export_from_model(
optimum/exporters/openvino/convert.py (3 changes: 1 addition & 2 deletions)
@@ -617,8 +617,7 @@ def export_from_model(
  )

  library_name = _infer_library_from_model_or_model_class(model, library_name=library_name)
- if library_name != "open_clip":
-     TasksManager.standardize_model_attributes(model, library_name=library_name)
+ TasksManager.standardize_model_attributes(model, library_name=library_name)

  if hasattr(model.config, "export_model_type"):
      model_type = model.config.export_model_type.replace("_", "-")
optimum/exporters/openvino/model_configs.py (19 changes: 17 additions & 2 deletions)
@@ -14,6 +14,7 @@

  import enum
  import importlib
+ import math
  from copy import deepcopy
  from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

@@ -2862,17 +2863,31 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
          dtype=int_dtype,
      )
  if input_name == "code_b":
+     # default value from https://github.com/deepseek-ai/Janus/blob/1daa72fa409002d40931bd7b36a9280362469ead/janus/models/vq_model.py#L42
+     z_channels = getattr(self.normalized_config.config.params, "z_channels", 256)
+     patch_size = int(math.sqrt(z_channels))
+     # default value from https://github.com/deepseek-ai/Janus/blob/1daa72fa409002d40931bd7b36a9280362469ead/generation_inference.py#L63
+     generated_image_size = getattr(self.normalized_config.config.params, "img_size", 384)
+     latent_heigh = int(generated_image_size // patch_size)
+     latent_width = int(generated_image_size // patch_size)
      return self.random_int_tensor(
-         [self.batch_size, 576],
+         [self.batch_size, int(latent_heigh * latent_width)],
          max_value=self.normalized_config.config.params.image_token_size,
          framework=framework,
          dtype=int_dtype,
      )
  if input_name == "image_shape":
      import torch
+     # default value from https://github.com/deepseek-ai/Janus/blob/1daa72fa409002d40931bd7b36a9280362469ead/janus/models/vq_model.py#L42
+     z_channels = getattr(self.normalized_config.config.params, "z_channels", 256)
+     patch_size = int(math.sqrt(z_channels))
+     # default value from https://github.com/deepseek-ai/Janus/blob/1daa72fa409002d40931bd7b36a9280362469ead/generation_inference.py#L63
+     generated_image_size = getattr(self.normalized_config.config.params, "img_size", 384)
+     latent_heigh = int(generated_image_size // patch_size)
+     latent_width = int(generated_image_size // patch_size)

      return torch.tensor(
-         [self.batch_size, self.normalized_config.config.params.n_embed, 24, 24], dtype=torch.int64
+         [self.batch_size, self.normalized_config.config.params.n_embed, latent_heigh, latent_width], dtype=torch.int64
      )
  if input_name == "hidden_state":
      return self.random_float_tensor(
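With the Janus defaults referenced in the comments above (z_channels = 256 and img_size = 384), the config-driven computation reproduces the values that were previously hard-coded. A minimal sketch of the arithmetic, assuming only those two defaults:

import math

z_channels = 256                                  # default cited from janus/models/vq_model.py
img_size = 384                                    # default cited from generation_inference.py
patch_size = int(math.sqrt(z_channels))           # 16
latent_height = img_size // patch_size            # 24
latent_width = img_size // patch_size             # 24
num_image_tokens = latent_height * latent_width   # 576, matching the removed hard-coded value
print(patch_size, latent_height, latent_width, num_image_tokens)  # 16 24 24 576

For checkpoints whose config overrides z_channels or img_size, the dummy inputs now follow the config instead of assuming a 24x24 latent grid and 576 image tokens.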
optimum/exporters/openvino/utils.py (23 changes: 12 additions & 11 deletions)
@@ -315,18 +315,19 @@ def save_preprocessors(
      maybe_save_preprocessors(model_name_or_path, output, trust_remote_code=trust_remote_code)


- def load_preprocessors(src_name_or_path: Union[str, Path], subfolder: str = "", trust_remote_code: bool = False):
+ def load_preprocessors(src_name_or_path: Union[str, Path], subfolder: str = "", trust_remote_code: bool = False, model_type: str = None):
      preprocessors = maybe_load_preprocessors(
          src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
      )
-     if importlib.util.find_spec("janus") is not None:
-         from janus.models import VLChatProcessor
-
-         try:
-             processor = VLChatProcessor.from_pretrained(
-                 src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
-             )
-             preprocessors.append(processor)
-         except Exception:
-             pass
+     if model_type == "janus":
+         if importlib.util.find_spec("janus") is not None:
+             from janus.models import VLChatProcessor
+
+             try:
+                 processor = VLChatProcessor.from_pretrained(
+                     src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
+                 )
+                 preprocessors.append(processor)
+             except Exception:
+                 pass
      return preprocessors
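The new model_type gate means the optional Janus processor lookup only runs for Janus checkpoints; exports of other model types skip the janus import entirely. A minimal usage sketch, where the checkpoint ids are illustrative placeholders rather than values taken from this commit:

from optimum.exporters.openvino.utils import load_preprocessors

# Janus checkpoint (illustrative id): VLChatProcessor is appended when the janus package is installed.
preprocessors = load_preprocessors(
    "deepseek-ai/Janus-Pro-1B",
    trust_remote_code=True,
    model_type="janus",
)

# Any other model type never touches the janus import path.
clip_preprocessors = load_preprocessors("openai/clip-vit-base-patch32", model_type="clip")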
