From b0d30bdc10320918d276e8fe8031c879d5103c80 Mon Sep 17 00:00:00 2001
From: mudler <2420543+mudler@users.noreply.github.com>
Date: Fri, 31 Oct 2025 19:35:31 +0000
Subject: [PATCH] chore(model gallery): :robot: add new models via gallery
 agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 gallery/index.yaml | 54 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index e25cdec66717..258af3b9f9e1 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -23363,3 +23363,57 @@
     - filename: Qwen3-Grand-Horror-Light-1.7B.Q4_K_M.gguf
       sha256: cbbb0c5f6874130a8ae253377fdc7ad25fa2c1e9bb45f1aaad88db853ef985dc
       uri: huggingface://mradermacher/Qwen3-Grand-Horror-Light-1.7B-GGUF/Qwen3-Grand-Horror-Light-1.7B.Q4_K_M.gguf
+- !!merge <<: *qwen3vl
+  name: "qwen3-vl-8b-instruct"
+  urls:
+    - https://huggingface.co/Mungert/Qwen3-VL-8B-Instruct-GGUF
+  description: |
+    ### **Qwen3-VL-8B-Instruct**
+    *by Qwen Team (Hugging Face)*
+
+    A state-of-the-art vision-language model designed for rich multimodal understanding and reasoning. Built on a powerful architecture with **8 billion parameters**, Qwen3-VL-8B-Instruct excels in visual perception, spatial reasoning, and long-context multimodal tasks.
+
+    #### 🔍 **Key Features**:
+    - **256K native context length** (expandable to 1M), ideal for long documents, videos, and complex scenes.
+    - **Advanced spatial & video understanding** with precise object localization and timestamp-aware reasoning.
+    - **Strong multimodal reasoning** in STEM, logic, and real-world tasks, well suited to agent-based applications.
+    - **Visual coding support**: generates HTML/CSS/JS, Draw.io diagrams, and code from images.
+    - **High-precision OCR** across **32 languages**, robust to low light and blur, with support for ancient scripts.
+    - **Visual Agent** capability: interprets and interacts with GUIs, tools, and workflows.
+
+    #### 🛠️ **Architecture Highlights**:
+    - **Interleaved-MRoPE**: Enhanced positional encoding for better video and temporal reasoning.
+    - **DeepStack**: Fuses multi-level visual features for sharper image-text alignment.
+    - **Text–Timestamp Alignment**: Enables precise event localization in long videos.
+
+    #### 📌 **Use Cases**:
+    - Image/video captioning & analysis
+    - Visual question answering (VQA)
+    - Document understanding & extraction
+    - GUI automation & agent-based tasks
+    - Long-form content synthesis (books, research, video summaries)
+
+    #### 📚 **Citation**:
+    ```bibtex
+    @misc{qwen3technicalreport,
+          title={Qwen3 Technical Report},
+          author={Qwen Team},
+          year={2025},
+          eprint={2505.09388},
+          archivePrefix={arXiv},
+          primaryClass={cs.CL},
+          url={https://arxiv.org/abs/2505.09388}
+    }
+    ```
+
+    > ✅ **Official Hugging Face model**: [`Qwen/Qwen3-VL-8B-Instruct`](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct)
+    > 🚀 **Try it live**: [Chat with Qwen3-VL](https://chat.qwenlm.ai/)
+
+    *Note: The GGUF version (Mungert/Qwen3-VL-8B-Instruct-GGUF) is a user-quantized variant and not the original model by the Qwen team.*
+  overrides:
+    parameters:
+      model: Qwen3-VL-8B-Instruct-q4_k_m.gguf
+  files:
+    - filename: Qwen3-VL-8B-Instruct-q4_k_m.gguf
+      sha256: a4d0b6e9d97ed31053fce7e2466c775ef39919bb86b1c56309b0e9089d540d45
+      uri: huggingface://Mungert/Qwen3-VL-8B-Instruct-GGUF/Qwen3-VL-8B-Instruct-q4_k_m.gguf