@@ -118,12 +118,94 @@ void VisionPipelineState::SetExtraInputs(const std::vector<ExtraInput>& extra_in
118118 image_features_ = std::make_unique<MultiModalFeatures>(*this , MultiModalFeatures::Mode::Output, // model output
119119 model_.config_ ->model .vision .outputs .image_features ,
120120 num_images_, num_image_tokens_);
121- image_features_->Add ();
121+ for (const auto & ei : extra_inputs) {
122+ if (ei.name == " pixel_values" ) {
123+ pixel_values_tensor_ = ei.tensor ;
124+ break ;
125+ }
126+ }
122127 extra_inputs_.Add (extra_inputs, model_.vision_session_ ->GetInputNames ());
123128}
124129
125- DeviceSpan<float > VisionPipelineState::Run (int current_length, DeviceSpan<int32_t >& next_tokens, DeviceSpan<int32_t > next_indices) {
126- State::Run (*model_.vision_session_ );
130+ // Create a [1, C, H, W] tensor and copy the i-th image from a [N, C, H, W] tensor.
131+ static std::shared_ptr<Tensor> MakeSingleImagePixelValues (const std::shared_ptr<Tensor>& full,
132+ int64_t index,
133+ DeviceInterface* device) {
134+ if (!full || !full->GetOrtTensor ()) {
135+ throw std::runtime_error (" MakeSingleImagePixelValues: source tensor is null" );
136+ }
137+ const auto full_shape = full->GetShape (); // expected [N, C, H, W]
138+ if (full_shape.size () != 4 ) {
139+ throw std::runtime_error (" MakeSingleImagePixelValues: expected [N, C, H, W] shape" );
140+ }
141+ const int64_t N = full_shape[0 ];
142+ const int64_t C = full_shape[1 ];
143+ const int64_t H = full_shape[2 ];
144+ const int64_t W = full_shape[3 ];
145+ if (index < 0 || index >= N) {
146+ throw std::runtime_error (" MakeSingleImagePixelValues: index out of range" );
147+ }
148+
149+ // Destination shape [1, C, H, W]
150+ std::vector<int64_t > dst_shape = {1 , C, H, W};
151+
152+ auto dst = std::make_shared<Tensor>(device, full->GetType ());
153+ dst->CreateTensor (dst_shape, /* make_static=*/ false );
154+
155+ // Compute byte ranges and copy
156+ const size_t elem_size = Ort::SizeOf (full->GetType ());
157+ const size_t per_image_bytes = static_cast <size_t >(C) * static_cast <size_t >(H) * static_cast <size_t >(W) * elem_size;
158+ const size_t offset_bytes = static_cast <size_t >(index) * per_image_bytes;
159+
160+ auto src_bytes = full->GetByteSpan ();
161+ auto dst_bytes = dst->GetByteSpan ();
162+
163+ if (offset_bytes + per_image_bytes > src_bytes.size () || per_image_bytes > dst_bytes.size ()) {
164+ throw std::runtime_error (" MakeSingleImagePixelValues: copy bounds exceeded" );
165+ }
166+
167+ dst_bytes.CopyFrom (src_bytes.subspan (offset_bytes, per_image_bytes));
168+ return dst;
169+ }
170+
171+ DeviceSpan<float > VisionPipelineState::Run (int current_length,
172+ DeviceSpan<int32_t >& next_tokens,
173+ DeviceSpan<int32_t > next_indices) {
174+ if (!model_.vision_session_ || !image_features_ || !pixel_values_tensor_) {
175+ return {};
176+ }
177+
178+ const int64_t total_images = num_images_;
179+ const size_t bytes_per_image = image_features_->BytesPerImage ();
180+
181+ // Flat destination bytes of the global features buffer
182+ auto dst_all_bytes = image_features_->AsByteSpan ();
183+
184+ // Bind a single-image output features object once and reuse across runs
185+ std::unique_ptr<MultiModalFeatures> run_features =
186+ std::make_unique<MultiModalFeatures>(*this ,
187+ MultiModalFeatures::Mode::Output,
188+ model_.config_ ->model .vision .outputs .image_features ,
189+ /* batch_size=*/ 1 ,
190+ /* num_feature_tokens=*/ num_image_tokens_);
191+ run_features->Add ();
192+
193+ for (int64_t i = 0 ; i < total_images; ++i) {
194+ auto pixel_values_i = MakeSingleImagePixelValues (pixel_values_tensor_, i, model_.p_device_ );
195+ extra_inputs_.Replace (" pixel_values" , pixel_values_i);
196+
197+ State::Run (*model_.vision_session_ );
198+
199+ auto src_bytes = run_features->AsByteSpan ();
200+
201+ const size_t dst_offset = static_cast <size_t >(i) * bytes_per_image;
202+ if (dst_offset + bytes_per_image <= dst_all_bytes.size () && bytes_per_image <= src_bytes.size ()) {
203+ dst_all_bytes.subspan (dst_offset, bytes_per_image).CopyFrom (src_bytes.subspan (0 , bytes_per_image));
204+ } else {
205+ throw std::runtime_error (" VisionPipelineState::Run: features copy out of bounds" );
206+ }
207+ }
208+
127209 return {};
128210}
129211
@@ -648,6 +730,7 @@ DeviceSpan<float> MultiModalDecoderPipelineState::Run(int current_length, Device
648730 if (num_audio_tokens_ > 0 && speech_state_) {
649731 speech_state_->Run (current_length, next_tokens, next_indices);
650732 }
733+ vision_state_->image_features_ ->Add ();
651734 if (vision_state_) embedding_state_->image_features_ ->ReuseFeaturesBuffer (*vision_state_->image_features_ );
652735 if (speech_state_) embedding_state_->audio_features_ ->ReuseFeaturesBuffer (*speech_state_->audio_features_ );
653736 embedding_state_->inputs_embeds_ .ReuseEmbeddingsBuffer (decoder_pipeline_state_->full_inputs_embeds_ );
0 commit comments