diff --git a/mistralrs-core/src/vision_models/phi3_inputs_processor.rs b/mistralrs-core/src/vision_models/phi3_inputs_processor.rs index 6b66af3bc9..4a80562320 100644 --- a/mistralrs-core/src/vision_models/phi3_inputs_processor.rs +++ b/mistralrs-core/src/vision_models/phi3_inputs_processor.rs @@ -379,6 +379,15 @@ impl ImagePreProcessor for Phi3InputsProcessor { let hd_image = Self::hd_transform(image, config.num_crops.expect("Need `num_crops`")); + let transforms_hd2 = Transforms { + input: &ToTensor, + inner_transforms: &[], + }; + + // (3,h,w) + let hd_image2 = hd_image.apply(transforms_hd2, device)?; + dbg!(hd_image2); + // Both hd and global have a normalization // Transforms for the HD image let transforms_hd = Transforms { @@ -394,7 +403,7 @@ impl ImagePreProcessor for Phi3InputsProcessor { // Resize with bicubic interpolation // (3,336,336) - let global_image = hd_image.unsqueeze(0)?.interpolate2d(336, 336)?.squeeze(0)?; + let global_image = hd_image.unsqueeze(0)?.interpolate2d(336, 336)?; let (_, h, w) = hd_image.dims3()?; let num_image_tokens = ((h as f32 / 336. * w as f32 / 336. + 1.) * 144. diff --git a/mistralrs-vision/src/transforms.rs b/mistralrs-vision/src/transforms.rs index a4aeb27961..f6c462a5a1 100644 --- a/mistralrs-vision/src/transforms.rs +++ b/mistralrs-vision/src/transforms.rs @@ -20,7 +20,7 @@ impl ToTensor { ) } let row = Tensor::cat(&row_accum, 0)?; - accum.push(row.reshape((row.dim(1)?, ()))?.unsqueeze(1)?); + accum.push(row.t()?.unsqueeze(1)?); } let t = Tensor::cat(&accum, 1)?.to_device(device)?; // Rescale to between 0 and 1