From 85355b31f6188ee5c8745a4d6b340803efd4982f Mon Sep 17 00:00:00 2001
From: mudler <2420543+mudler@users.noreply.github.com>
Date: Sat, 1 Nov 2025 06:13:28 +0000
Subject: [PATCH] chore(model gallery): :robot: add new models via gallery
 agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 gallery/index.yaml | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index e25cdec66717..86941a674d6f 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -23363,3 +23363,29 @@
     - filename: Qwen3-Grand-Horror-Light-1.7B.Q4_K_M.gguf
      sha256: cbbb0c5f6874130a8ae253377fdc7ad25fa2c1e9bb45f1aaad88db853ef985dc
      uri: huggingface://mradermacher/Qwen3-Grand-Horror-Light-1.7B-GGUF/Qwen3-Grand-Horror-Light-1.7B.Q4_K_M.gguf
+- !!merge <<: *qwen3vl
+  name: "qwen.qwen3-vl-235b-a22b-instruct"
+  urls:
+    - https://huggingface.co/DevQuasar/Qwen.Qwen3-VL-235B-A22B-Instruct-GGUF
+  description: |
+    **Qwen3-VL-235B-A22B-Instruct** is a state-of-the-art vision-language model from the Qwen series, designed for advanced multimodal understanding and reasoning. With 235 billion total parameters (22 billion activated per token), it excels at both visual and textual comprehension, supporting complex tasks such as image captioning, visual question answering, document understanding, and spatial reasoning.
+
+    Key features include:
+    - **Ultra-long context** (up to 1M tokens), enabling deep analysis of books, long videos, and detailed documents.
+    - **Advanced visual perception** with high-precision object detection, spatial reasoning, and 3D grounding.
+    - **Multilingual OCR** (32 languages) with strong performance on low-light, blurry, or tilted inputs.
+    - **Visual agent capabilities**: can interpret and interact with GUIs on PC or mobile devices.
+    - **Code generation from visuals**: converts images and videos into HTML, CSS, JavaScript, and Draw.io diagrams.
+    - **Enhanced multimodal reasoning** for STEM problems, causal analysis, and logical inference.
+    - Built-in support for **interleaved-MRoPE**, **DeepStack fusion**, and **text-timestamp alignment** for superior temporal and spatial modeling.
+
+    The Qwen3-VL series ships in both dense and Mixture-of-Experts (MoE) architectures; this 235B-A22B variant uses the MoE design and is optimized for deployment across edge and cloud environments. It is well suited to research, AI agents, and enterprise applications requiring deep vision-language synergy.
+
+    *Note: The upstream model comes from Alibaba’s Qwen team; the GGUF build at DevQuasar/Qwen.Qwen3-VL-235B-A22B-Instruct-GGUF is a quantized variant for efficient local inference.*
+  overrides:
+    parameters:
+      model: Qwen.Qwen3-VL-235B-A22B-Instruct.Q4_K_M-00001-of-00011.gguf
+  files:
+    - filename: Qwen.Qwen3-VL-235B-A22B-Instruct.Q4_K_M-00001-of-00011.gguf
+      sha256: 41c963cc019dbb4d946ca7ff69baed17181b33dd7dbd8498d75dfccb21549fe0
+      uri: huggingface://DevQuasar/Qwen.Qwen3-VL-235B-A22B-Instruct-GGUF/Qwen.Qwen3-VL-235B-A22B-Instruct.Q4_K_M-00001-of-00011.gguf
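
Once this gallery entry is installed, LocalAI serves the model under the entry's `name` field through its OpenAI-compatible API. Below is a minimal usage sketch in Python, assuming a LocalAI instance on http://localhost:8080, the `openai` client package, and an unauthenticated setup; the host, port, api_key value, and image URL are all placeholders, not values taken from this patch:

    # Query the newly added Qwen3-VL gallery model via LocalAI's
    # OpenAI-compatible /v1/chat/completions endpoint (a sketch;
    # host, port, and the example image URL are assumptions).
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

    response = client.chat.completions.create(
        model="qwen.qwen3-vl-235b-a22b-instruct",  # matches the entry's name field
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                {"type": "image_url",
                 "image_url": {"url": "https://example.com/sample.png"}},
            ],
        }],
    )
    print(response.choices[0].message.content)

The message shape follows OpenAI's vision-content convention, which LocalAI accepts for multimodal models; for a multi-part GGUF like this Q4_K_M build, only the first shard is named in `parameters.model` and the backend picks up the remaining parts.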