@@ -388,7 +388,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
388388    const  int  n_layer              = hparams.n_layer ;
389389    const  float  eps                = hparams.eps ;
390390
391-     GGML_ASSERT (imgs.size () == 1 ); //  batch_size == 1
391+     GGML_ASSERT (imgs.entries . size () == 1 ); //  batch_size == 1
392392
393393    struct  ggml_init_params  params = {
394394        /* .mem_size   =*/ buf_compute_meta .size (),
@@ -540,16 +540,16 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
540540        image_size_width  = load_image_size.width ;
541541        image_size_height = load_image_size.height ;
542542        if  (is_inf) {
543-             image_size_width  = imgs[0 ]->nx ;
544-             image_size_height = imgs[0 ]->ny ;
543+             image_size_width  = imgs. entries [0 ]->nx ;
544+             image_size_height = imgs. entries [0 ]->ny ;
545545        }
546546    }
547547    else  if  (ctx->has_qwen2vl_merger ) {
548548        //  use the image's native resolution when image is avaible
549549        if  (is_inf) {
550550        //  if (imgs->data->nx && imgs->data->ny) {
551-             image_size_width  = imgs[0 ]->nx ;
552-             image_size_height = imgs[0 ]->ny ;
551+             image_size_width  = imgs. entries [0 ]->nx ;
552+             image_size_height = imgs. entries [0 ]->ny ;
553553        }
554554    }
555555    const  int  patch_size           = hparams.patch_size ;
@@ -564,7 +564,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
564564    const  float  eps                = hparams.eps ;
565565    int  mrope_sections[4 ] = {d_head/4 , d_head/4 , d_head/4 , d_head/4 };
566566
567-     const  int  batch_size = imgs.size ();
567+     const  int  batch_size = imgs.entries . size ();
568568
569569    if  (ctx->has_llava_projector  || ctx->has_minicpmv_projector  || ctx->has_glm_projector ) {
570570        GGML_ASSERT (batch_size == 1 );
@@ -1477,15 +1477,15 @@ struct clip_model_loader {
14771477
14781478        //  create a fake batch
14791479        clip_image_f32_batch batch;
1480-         clip_image_f32_ptr img;
1480+         clip_image_f32_ptr img ( clip_image_f32_init ()) ;
14811481        clip_image_size image_size;
14821482        image_size.width   = clip_get_image_size (&ctx_clip);
14831483        image_size.height  = clip_get_image_size (&ctx_clip);
14841484        int  n_patches = clip_get_image_size (&ctx_clip) / image_size.width ;
14851485        img->nx  = n_patches;
14861486        img->ny  = n_patches;
14871487        img->buf .resize (n_patches * image_size.width  * image_size.height  * 3 );
1488-         batch.push_back (std::move (img));
1488+         batch.entries . push_back (std::move (img));
14891489
14901490        ggml_cgraph * gf = clip_image_build_graph (&ctx_clip, batch, image_size, false );
14911491        ggml_backend_sched_reserve (ctx_clip.sched .get (), gf);
@@ -1626,31 +1626,31 @@ void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { if (batch) d
16261626void  clip_image_f32_batch_free (struct  clip_image_f32_batch  * batch) { if  (batch) delete  batch; }
16271627
16281628size_t  clip_image_f32_batch_n_images (const  struct  clip_image_f32_batch  * batch) {
1629-     return  batch->size ();
1629+     return  batch->entries . size ();
16301630}
16311631
16321632size_t  clip_image_f32_batch_nx (const  struct  clip_image_f32_batch  * batch, int  idx) {
1633-     if  (idx < 0  || idx >= (int )batch->size ()) {
1633+     if  (idx < 0  || idx >= (int )batch->entries . size ()) {
16341634        LOG_ERR (" %s: invalid index %d\n " 
16351635        return  0 ;
16361636    }
1637-     return  batch->at ( idx) ->nx ;
1637+     return  batch->entries [ idx] ->nx ;
16381638}
16391639
16401640size_t  clip_image_f32_batch_ny (const  struct  clip_image_f32_batch  * batch, int  idx) {
1641-     if  (idx < 0  || idx >= (int )batch->size ()) {
1641+     if  (idx < 0  || idx >= (int )batch->entries . size ()) {
16421642        LOG_ERR (" %s: invalid index %d\n " 
16431643        return  0 ;
16441644    }
1645-     return  batch->at ( idx) ->ny ;
1645+     return  batch->entries [ idx] ->ny ;
16461646}
16471647
16481648clip_image_f32 * clip_image_f32_get_img (const  struct  clip_image_f32_batch  * batch, int  idx) {
1649-     if  (idx < 0  || idx >= (int )batch->size ()) {
1649+     if  (idx < 0  || idx >= (int )batch->entries . size ()) {
16501650        LOG_ERR (" %s: invalid index %d\n " 
16511651        return  nullptr ;
16521652    }
1653-     return  batch->at ( idx) .get ();
1653+     return  batch->entries [ idx] .get ();
16541654}
16551655
16561656void  clip_build_img_from_pixels (const  unsigned  char  * rgb_pixels, int  nx, int  ny, clip_image_u8 * img) {
@@ -1884,7 +1884,7 @@ static std::vector<clip_image_u8_ptr> divide_to_patches_u8(const clip_image_u8 &
18841884    int  height = image.ny ;
18851885    for  (int  i = 0 ; i < height; i += patch_size) {
18861886        for  (int  j = 0 ; j < width; j += patch_size) {
1887-             clip_image_u8_ptr patch;
1887+             clip_image_u8_ptr patch ( clip_image_u8_init ()) ;
18881888            patch->nx  = std::min (patch_size, width - j);
18891889            patch->ny  = std::min (patch_size, height - i);
18901890            patch->buf .resize (3  * patch->nx  * patch->ny );
@@ -1990,14 +1990,14 @@ static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_im
19901990
19911991    if  (multiple <= 1 ) {
19921992        auto  best_size = uhd_find_best_resize (original_size, scale_resolution, patch_size, true );
1993-         clip_image_u8_ptr source_image;
1993+         clip_image_u8_ptr source_image ( clip_image_u8_init ()) ;
19941994        bicubic_resize (*img, *source_image, best_size.first , best_size.second );
19951995        //  source_image = image.resize(best_size, Image.Resampling.BICUBIC)
19961996        images.back ().push_back (std::move (source_image));
19971997    }
19981998    else  if  (multiple > 1 ) {
19991999        auto  best_size = uhd_find_best_resize (original_size, scale_resolution, patch_size);
2000-         clip_image_u8_ptr source_image;
2000+         clip_image_u8_ptr source_image ( clip_image_u8_init ()) ;
20012001        bicubic_resize (*img, *source_image, best_size.first , best_size.second );
20022002        //  source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
20032003        LOG_DBG (" %s: image_size: %d %d; source_image size: %d %d\n " nx , img->ny , best_size.first , best_size.second );
@@ -2007,7 +2007,7 @@ static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_im
20072007        LOG_DBG (" %s: image_size: %d %d; best_grid: %d %d\n " nx , img->ny , best_grid.first , best_grid.second );
20082008
20092009        auto  refine_size = uhd_get_refine_size (original_size, best_grid, scale_resolution, patch_size, true );
2010-         clip_image_u8_ptr refine_image;
2010+         clip_image_u8_ptr refine_image ( clip_image_u8_init ()) ;
20112011        bicubic_resize (*img, *refine_image, refine_size.first , refine_size.second );
20122012
20132013        LOG_DBG (" %s: refine_image_size: %d %d; refine_size: %d %d\n " nx , refine_image->ny , refine_size.first , refine_size.second );
@@ -2020,7 +2020,7 @@ static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_im
20202020        for  (int  patches_i = 0 , ic = 0 ; patches_i < height && ic < best_grid.second ; patches_i += grid_y, ic += 1 ){
20212021            images.push_back (std::vector<clip_image_u8_ptr>());
20222022            for (int  patches_j = 0 , jc = 0 ; patches_j < width && jc < best_grid.first ; patches_j += grid_x, jc += 1 ){
2023-                 clip_image_u8_ptr patch;
2023+                 clip_image_u8_ptr patch ( clip_image_u8_init ()) ;
20242024                patch->nx  = grid_x;
20252025                patch->ny  = grid_y;
20262026                patch->buf .resize (3  * patch->nx  * patch->ny );
@@ -2062,36 +2062,36 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
20622062        for  (size_t  i = 0 ; i < imgs.size (); ++i) {
20632063            for  (size_t  j = 0 ; j < imgs[i].size (); ++j) {
20642064                LOG_DBG (" %s: %d %d\n " nx ,imgs[i][j]->ny );
2065-                 clip_image_f32_ptr res;
2065+                 clip_image_f32_ptr res ( clip_image_f32_init ()) ;
20662066                normalize_image_u8_to_f32 (*imgs[i][j], *res, ctx->image_mean , ctx->image_std );
2067-                 res_imgs->push_back (std::move (res));
2067+                 res_imgs->entries . push_back (std::move (res));
20682068            }
20692069        }
20702070        return  true ;
20712071    }
20722072    else  if  (ctx->has_qwen2vl_merger ) {
2073-         clip_image_u8_ptr  resized;
2073+         clip_image_u8  resized;
20742074        auto  patch_size = clip_get_patch_size (ctx) * 2 ;
20752075        int  nx = ceil ((float )img->nx  / patch_size) * patch_size;
20762076        int  ny = ceil ((float )img->ny  / patch_size) * patch_size;
2077-         bicubic_resize (*img, * resized, nx, ny);
2077+         bicubic_resize (*img, resized, nx, ny);
20782078
2079-         clip_image_f32_ptr img_f32;
2080-         //  clip_image_f32_ptr res;
2081-         normalize_image_u8_to_f32 (* resized, *img_f32, ctx->image_mean , ctx->image_std );
2079+         clip_image_f32_ptr img_f32 ( clip_image_f32_init ()) ;
2080+         //  clip_image_f32_ptr res(clip_image_f32_init()) ;
2081+         normalize_image_u8_to_f32 (resized, *img_f32, ctx->image_mean , ctx->image_std );
20822082        //  res_imgs->data[0] = *res;
2083-         res_imgs->push_back (std::move (img_f32));
2083+         res_imgs->entries . push_back (std::move (img_f32));
20842084        return  true ;
20852085    }
20862086
20872087    if  (ctx->has_glm_projector  || ctx->proj_type  == PROJECTOR_TYPE_GEMMA3) {
20882088        clip_image_u8 resized_image;
20892089        int32_t  sz=ctx->vision_model .hparams .image_size ;
20902090        bicubic_resize (*img, resized_image,sz,sz);
2091-         clip_image_f32_ptr img_f32;
2091+         clip_image_f32_ptr img_f32 ( clip_image_f32_init ()) ;
20922092        // clip_image_save_to_bmp(resized_image, "resized.bmp");
20932093        normalize_image_u8_to_f32 (resized_image, *img_f32, ctx->image_mean , ctx->image_std );
2094-         res_imgs->push_back (std::move (img_f32));
2094+         res_imgs->entries . push_back (std::move (img_f32));
20952095        return  true ;
20962096    }
20972097
@@ -2106,12 +2106,12 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
21062106        pad_to_square = false ;
21072107    }
21082108    //  free the previous res_imgs if any set
2109-     res_imgs->clear ();
2109+     res_imgs->entries . clear ();
21102110
21112111    //  the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
21122112    //  see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
21132113
2114-     clip_image_u8_ptr temp; //  we will keep the input image data here temporarily
2114+     clip_image_u8_ptr temp ( clip_image_u8_init ()) ; //  we will keep the input image data here temporarily
21152115    if  (pad_to_square && img->nx  != img->ny ) {
21162116        int  longer_side = std::max (img->nx , img->ny );
21172117        temp->nx  = longer_side;
@@ -2156,14 +2156,14 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
21562156
21572157            std::vector<clip_image_u8_ptr> patches = divide_to_patches_u8 (*temp, params.image_size ); //  prepare spatial sorted main patches of image_size each (336 in llava-1.6)
21582158
2159-             clip_image_u8_ptr image_original_resize;
2159+             clip_image_u8_ptr image_original_resize ( clip_image_u8_init ()) ;
21602160            //  bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
21612161            bicubic_resize (*img, *image_original_resize, params.image_size , params.image_size ); //  in python this is "shortest_edge", but all CLIP are square
21622162            patches.insert (patches.begin (), std::move (image_original_resize));
21632163            for  (auto  & patch : patches) {
2164-                 clip_image_f32_ptr res;
2164+                 clip_image_f32_ptr res ( clip_image_f32_init ()) ;
21652165                normalize_image_u8_to_f32 (*patch, *res, ctx->image_mean , ctx->image_std );
2166-                 res_imgs->push_back (std::move (res));
2166+                 res_imgs->entries . push_back (std::move (res));
21672167            }
21682168
21692169            return  true ;
@@ -2181,7 +2181,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
21812181
21822182    const  int  nx2 = ctx->vision_model .hparams .image_size ;
21832183    const  int  ny2 = ctx->vision_model .hparams .image_size ;
2184-     clip_image_f32_ptr res;
2184+     clip_image_f32_ptr res ( clip_image_f32_init ()) ;
21852185    res->nx  = nx2;
21862186    res->ny  = ny2;
21872187    res->buf .resize (3  * nx2 * ny2);
@@ -2242,7 +2242,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
22422242    //  }
22432243    //  res_imgs.push_back(res);
22442244
2245-     res_imgs->push_back (std::move (res));
2245+     res_imgs->entries . push_back (std::move (res));
22462246
22472247    return  true ;
22482248}
@@ -2424,9 +2424,9 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
24242424    }
24252425
24262426    clip_image_f32_batch imgs;
2427-     clip_image_f32_ptr img_copy;
2427+     clip_image_f32_ptr img_copy ( clip_image_f32_init ()) ;
24282428    *img_copy = *img;
2429-     imgs.push_back (std::move (img_copy));
2429+     imgs.entries . push_back (std::move (img_copy));
24302430
24312431    return  clip_image_batch_encode (ctx, n_threads, &imgs, vec);
24322432}
@@ -2439,7 +2439,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
24392439        return  false ;
24402440    }
24412441
2442-     int  batch_size = imgs.size ();
2442+     int  batch_size = imgs.entries . size ();
24432443    if  (ctx->has_llava_projector ) {
24442444        GGML_ASSERT (batch_size == 1 ); //  TODO: support multiple images
24452445    }
@@ -2466,8 +2466,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
24662466    int  image_size_width  = image_size;
24672467    int  image_size_height = image_size;
24682468    if  (ctx->has_minicpmv_projector  | ctx->has_qwen2vl_merger ) {
2469-         image_size_width  = imgs[0 ]->nx ;
2470-         image_size_height = imgs[0 ]->ny ;
2469+         image_size_width  = imgs. entries [0 ]->nx ;
2470+         image_size_height = imgs. entries [0 ]->ny ;
24712471    }
24722472    const  int  patch_size    = hparams.patch_size ;
24732473    const  int  num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
@@ -2479,9 +2479,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
24792479        struct  ggml_tensor  * inp_raw = ggml_graph_get_tensor (gf, " inp_raw" 
24802480        float  * data = (float  *)malloc (ggml_nbytes (inp_raw));
24812481
2482-         for  (size_t  i = 0 ; i < imgs.size (); i++) {
2483-             const  int  nx = imgs[i]->nx ;
2484-             const  int  ny = imgs[i]->ny ;
2482+         for  (size_t  i = 0 ; i < imgs.entries . size (); i++) {
2483+             const  int  nx = imgs. entries [i]->nx ;
2484+             const  int  ny = imgs. entries [i]->ny ;
24852485            if  (!(ctx->has_minicpmv_projector  | ctx->has_qwen2vl_merger )) {
24862486                GGML_ASSERT (nx == image_size && ny == image_size);
24872487            }
@@ -2492,7 +2492,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
24922492                for  (int  k = 0 ; k < 3 ; k++) {
24932493                    for  (int  y = 0 ; y < ny; y++) {
24942494                        for  (int  x = 0 ; x < nx; x++) {
2495-                             data[(b * 3  * n) + k * n + y * nx + x] = imgs[b]->buf [3  * (y * nx + x) + k];
2495+                             data[(b * 3  * n) + k * n + y * nx + x] = imgs. entries [b]->buf [3  * (y * nx + x) + k];
24962496                        }
24972497                    }
24982498                }
0 commit comments