 import torch
 import numpy as np
 from gguf import *
-from transformers import CLIPModel, CLIPProcessor
+from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
 
 TEXT = "clip.text"
 VISION = "clip.vision"
@@ -78,11 +78,19 @@ def bytes_to_unicode():
                 help="Save a text-only model. It can't be used to encode images")
 ap.add_argument("--vision-only", action="store_true", required=False,
                 help="Save a vision-only model. It can't be used to encode texts")
+ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
+                help="The CLIP model is a pure vision model (for example, a ShareGPT4V vision extract)")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
-ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
-ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
+# Example: --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
+default_image_mean = [0.48145466, 0.4578275, 0.40821073]
+default_image_std = [0.26862954, 0.26130258, 0.27577711]
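+# These are the OpenAI CLIP normalization constants; they serve as the fallback when no processor config is available.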
+ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor)', default=None)
+ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
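+# Sketch of an invocation for a ShareGPT4V-style vision extract (paths and the -m model-dir flag are illustrative):
+#   python convert-image-encoder-to-gguf.py -m vision-tower/ --clip_model_is_vision \
+#       --llava-projector vision-tower/llava.projector --output-dir vision-tower/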
 
 args = ap.parse_args()
 
 
@@ -96,15 +104,22 @@ def bytes_to_unicode():
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir
 
-
-with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
-    vocab = json.load(f)
-    tokens = [key for key in vocab]
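+# A pure vision checkpoint ships no tokenizer, so there is no vocab.json to read.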
+if args.clip_model_is_vision:
+    vocab = None
+    tokens = None
+else:
+    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+        vocab = json.load(f)
+        tokens = [key for key in vocab]
 
 with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
     config = json.load(f)
-    v_hparams = config["vision_config"]
-    t_hparams = config["text_config"]
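+    # Vision-only exports (e.g. a ShareGPT4V vision tower) keep the vision hyperparameters at the top level of config.json.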
+    if args.clip_model_is_vision:
+        v_hparams = config
+        t_hparams = None
+    else:
+        v_hparams = config["vision_config"]
+        t_hparams = config["text_config"]
 
 # possible data types
 #   ftype == 0 -> float32
@@ -117,9 +132,12 @@ def bytes_to_unicode():
 if args.use_f32:
     ftype = 0
 
-
-model = CLIPModel.from_pretrained(dir_model)
-processor = CLIPProcessor.from_pretrained(dir_model)
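+# A standalone vision tower is loaded with CLIPVisionModel; such exports typically have no CLIPProcessor config to read.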
+if args.clip_model_is_vision:
+    model = CLIPVisionModel.from_pretrained(dir_model)
+    processor = None
+else:
+    model = CLIPModel.from_pretrained(dir_model)
+    processor = CLIPProcessor.from_pretrained(dir_model)
 
 fname_middle = None
 has_text_encoder = True
@@ -128,13 +146,13 @@ def bytes_to_unicode():
 if args.text_only:
     fname_middle = "text-"
     has_vision_encoder = False
-elif args.vision_only:
-    fname_middle = "vision-"
-    has_text_encoder = False
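+# The --llava-projector branch is checked before --vision-only so the projector takes precedence when both flags are set.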
 elif args.llava_projector is not None:
     fname_middle = "mmproj-"
     has_text_encoder = False
     has_llava_projector = True
+elif args.vision_only:
+    fname_middle = "vision-"
+    has_text_encoder = False
 else:
     fname_middle = ""
 
@@ -182,8 +200,12 @@ def bytes_to_unicode():
     block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
 
-    image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
-    image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
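+    # Prefer the processor's own normalization stats; a CLI value equal to the defaults is treated as "not overridden".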
+    if processor is not None:
+        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
+        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
+    else:
+        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
+        image_std = args.image_std if args.image_std is not None else default_image_std
     fout.add_array("clip.vision.image_mean", image_mean)
     fout.add_array("clip.vision.image_std", image_std)
 