diff --git a/build.sbt b/build.sbt index 2e2853e89eadd1..5d6bc3cb7de7a0 100644 --- a/build.sbt +++ b/build.sbt @@ -6,7 +6,7 @@ name := getPackageName(is_silicon, is_gpu, is_aarch64) organization := "com.johnsnowlabs.nlp" -version := "6.0.5" +version := "6.1.0-rc1" (ThisBuild / scalaVersion) := scalaVer diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 22b306b0354c9c..cfdcba4ca0ada7 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -128,11 +128,11 @@ object Dependencies { val azureIdentity = "com.azure" % "azure-identity" % azureIdentityVersion % Provided val azureStorage = "com.azure" % "azure-storage-blob" % azureStorageVersion % Provided - val llamaCppVersion = "0.1.6" - val llamaCppCPU = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-cpu" % llamaCppVersion - val llamaCppGPU = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-gpu" % llamaCppVersion - val llamaCppSilicon = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-silicon" % llamaCppVersion - val llamaCppAarch64 = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-aarch64" % llamaCppVersion + val llamaCppVersion = "1.0.1" + val llamaCppCPU = "com.johnsnowlabs.nlp" % "jsl-llamacpp-cpu" % llamaCppVersion + val llamaCppGPU = "com.johnsnowlabs.nlp" % "jsl-llamacpp-gpu" % llamaCppVersion + val llamaCppSilicon = "com.johnsnowlabs.nlp" % "jsl-llamacpp-silicon" % llamaCppVersion + val llamaCppAarch64 = "com.johnsnowlabs.nlp" % "jsl-llamacpp-aarch64" % llamaCppVersion val jsoupVersion = "1.18.2" diff --git a/python/sparknlp/annotator/seq2seq/auto_gguf_model.py b/python/sparknlp/annotator/seq2seq/auto_gguf_model.py index 37c96319564782..2d01c29fea57fc 100755 --- a/python/sparknlp/annotator/seq2seq/auto_gguf_model.py +++ b/python/sparknlp/annotator/seq2seq/auto_gguf_model.py @@ -253,7 +253,9 @@ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFMo nCtx=4096, nBatch=512, embedding=False, - nPredict=100 + nPredict=100, + nGpuLayers=99, + systemPrompt="You are a helpful assistant." 
) @staticmethod diff --git a/python/sparknlp/common/properties.py b/python/sparknlp/common/properties.py index a134a86ac562eb..4cfbf70c5f0dd8 100644 --- a/python/sparknlp/common/properties.py +++ b/python/sparknlp/common/properties.py @@ -765,14 +765,14 @@ class HasLlamaCppProperties: # -------- MODEl PARAMETERS -------- nThreads = Param(Params._dummy(), "nThreads", "Set the number of threads to use during generation", typeConverter=TypeConverters.toInt) - nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation", - typeConverter=TypeConverters.toInt) + # nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation", + # typeConverter=TypeConverters.toInt) nThreadsBatch = Param(Params._dummy(), "nThreadsBatch", "Set the number of threads to use during batch and prompt processing", typeConverter=TypeConverters.toInt) - nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft", - "Set the number of threads to use during batch and prompt processing", - typeConverter=TypeConverters.toInt) + # nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft", + # "Set the number of threads to use during batch and prompt processing", + # typeConverter=TypeConverters.toInt) nCtx = Param(Params._dummy(), "nCtx", "Set the size of the prompt context", typeConverter=TypeConverters.toInt) nBatch = Param(Params._dummy(), "nBatch", "Set the logical batch size for prompt processing (must be >=32 to use BLAS)", @@ -782,12 +782,12 @@ class HasLlamaCppProperties: typeConverter=TypeConverters.toInt) nDraft = Param(Params._dummy(), "nDraft", "Set the number of tokens to draft for speculative decoding", typeConverter=TypeConverters.toInt) - nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process", - typeConverter=TypeConverters.toInt) - nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode", - typeConverter=TypeConverters.toInt) - pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability", - typeConverter=TypeConverters.toFloat) + # nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process", + # typeConverter=TypeConverters.toInt) + # nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode", + # typeConverter=TypeConverters.toInt) + # pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability", + # typeConverter=TypeConverters.toFloat) nGpuLayers = Param(Params._dummy(), "nGpuLayers", "Set the number of layers to store in VRAM (-1 - use default)", typeConverter=TypeConverters.toInt) nGpuLayersDraft = Param(Params._dummy(), "nGpuLayersDraft", @@ -802,10 +802,10 @@ class HasLlamaCppProperties: typeConverter=TypeConverters.toString) mainGpu = Param(Params._dummy(), "mainGpu", "Set the main GPU that is used for scratch and small tensors.", typeConverter=TypeConverters.toInt) - tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs", - typeConverter=TypeConverters.toListFloat) - grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt) - grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt) + # tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs", + # 
typeConverter=TypeConverters.toListFloat) + # grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt) + # grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt) ropeFreqBase = Param(Params._dummy(), "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling", typeConverter=TypeConverters.toFloat) ropeFreqScale = Param(Params._dummy(), "ropeFreqScale", @@ -837,7 +837,7 @@ class HasLlamaCppProperties: typeConverter=TypeConverters.toString) # Set the RoPE frequency scaling method, defaults to linear unless specified by the model. # - # - UNSPECIFIED: Don't use any scaling + # - NONE: Don't use any scaling # - LINEAR: Linear scaling # - YARN: YaRN RoPE scaling ropeScalingType = Param(Params._dummy(), "ropeScalingType", @@ -848,26 +848,28 @@ class HasLlamaCppProperties: # - 0 NONE: Don't use any pooling # - 1 MEAN: Mean Pooling # - 2 CLS: CLS Pooling + # - 3 LAST: Last token pooling + # - 4 RANK: For reranked models poolingType = Param(Params._dummy(), "poolingType", "Set the pooling type for embeddings, use model default if unspecified", typeConverter=TypeConverters.toString) modelDraft = Param(Params._dummy(), "modelDraft", "Set the draft model for speculative decoding", typeConverter=TypeConverters.toString) modelAlias = Param(Params._dummy(), "modelAlias", "Set a model alias", typeConverter=TypeConverters.toString) - lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath", - "Set path to static lookup cache to use for lookup decoding (not updated by generation)", - typeConverter=TypeConverters.toString) - lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath", - "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)", - typeConverter=TypeConverters.toString) + # lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath", + # "Set path to static lookup cache to use for lookup decoding (not updated by generation)", + # typeConverter=TypeConverters.toString) + # lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath", + # "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)", + # typeConverter=TypeConverters.toString) # loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters") embedding = Param(Params._dummy(), "embedding", "Whether to load model with embedding support", typeConverter=TypeConverters.toBoolean) flashAttention = Param(Params._dummy(), "flashAttention", "Whether to enable Flash Attention", typeConverter=TypeConverters.toBoolean) - inputPrefixBos = Param(Params._dummy(), "inputPrefixBos", - "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string", - typeConverter=TypeConverters.toBoolean) + # inputPrefixBos = Param(Params._dummy(), "inputPrefixBos", + # "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string", + # typeConverter=TypeConverters.toBoolean) useMmap = Param(Params._dummy(), "useMmap", "Whether to use memory-map model (faster load but may increase pageouts if not using mlock)", typeConverter=TypeConverters.toBoolean) @@ -948,17 +950,17 @@ def setNThreads(self, nThreads: int): """Set the number of threads to use during generation""" return self._set(nThreads=nThreads) - def setNThreadsDraft(self, nThreadsDraft: int): - """Set the number of threads to use during draft generation""" - return self._set(nThreadsDraft=nThreadsDraft) 
+ # def setNThreadsDraft(self, nThreadsDraft: int): + # """Set the number of threads to use during draft generation""" + # return self._set(nThreadsDraft=nThreadsDraft) def setNThreadsBatch(self, nThreadsBatch: int): """Set the number of threads to use during batch and prompt processing""" return self._set(nThreadsBatch=nThreadsBatch) - def setNThreadsBatchDraft(self, nThreadsBatchDraft: int): - """Set the number of threads to use during batch and prompt processing""" - return self._set(nThreadsBatchDraft=nThreadsBatchDraft) + # def setNThreadsBatchDraft(self, nThreadsBatchDraft: int): + # """Set the number of threads to use during batch and prompt processing""" + # return self._set(nThreadsBatchDraft=nThreadsBatchDraft) def setNCtx(self, nCtx: int): """Set the size of the prompt context""" @@ -976,17 +978,17 @@ def setNDraft(self, nDraft: int): """Set the number of tokens to draft for speculative decoding""" return self._set(nDraft=nDraft) - def setNChunks(self, nChunks: int): - """Set the maximal number of chunks to process""" - return self._set(nChunks=nChunks) + # def setNChunks(self, nChunks: int): + # """Set the maximal number of chunks to process""" + # return self._set(nChunks=nChunks) - def setNSequences(self, nSequences: int): - """Set the number of sequences to decode""" - return self._set(nSequences=nSequences) + # def setNSequences(self, nSequences: int): + # """Set the number of sequences to decode""" + # return self._set(nSequences=nSequences) - def setPSplit(self, pSplit: float): - """Set the speculative decoding split probability""" - return self._set(pSplit=pSplit) + # def setPSplit(self, pSplit: float): + # """Set the speculative decoding split probability""" + # return self._set(pSplit=pSplit) def setNGpuLayers(self, nGpuLayers: int): """Set the number of layers to store in VRAM (-1 - use default)""" @@ -1004,17 +1006,17 @@ def setMainGpu(self, mainGpu: int): """Set the main GPU that is used for scratch and small tensors.""" return self._set(mainGpu=mainGpu) - def setTensorSplit(self, tensorSplit: List[float]): - """Set how split tensors should be distributed across GPUs""" - return self._set(tensorSplit=tensorSplit) + # def setTensorSplit(self, tensorSplit: List[float]): + # """Set how split tensors should be distributed across GPUs""" + # return self._set(tensorSplit=tensorSplit) - def setGrpAttnN(self, grpAttnN: int): - """Set the group-attention factor""" - return self._set(grpAttnN=grpAttnN) + # def setGrpAttnN(self, grpAttnN: int): + # """Set the group-attention factor""" + # return self._set(grpAttnN=grpAttnN) - def setGrpAttnW(self, grpAttnW: int): - """Set the group-attention width""" - return self._set(grpAttnW=grpAttnW) + # def setGrpAttnW(self, grpAttnW: int): + # """Set the group-attention width""" + # return self._set(grpAttnW=grpAttnW) def setRopeFreqBase(self, ropeFreqBase: float): """Set the RoPE base frequency, used by NTK-aware scaling""" @@ -1049,7 +1051,16 @@ def setDefragmentationThreshold(self, defragmentationThreshold: float): return self._set(defragmentationThreshold=defragmentationThreshold) def setNumaStrategy(self, numaStrategy: str): - """Set optimization strategies that help on some NUMA systems (if available)""" + """Set optimization strategies that help on some NUMA systems (if available) + + Possible values: + + - DISABLED: No NUMA optimizations + - DISTRIBUTE: spread execution evenly over all + - ISOLATE: only spawn threads on CPUs on the node that execution started on + - NUMA_CTL: use the CPU map provided by numactl + - MIRROR: 
Mirrors the model across NUMA nodes + """ numaUpper = numaStrategy.upper() numaStrategies = ["DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR"] if numaUpper not in numaStrategies: @@ -1060,13 +1071,36 @@ def setNumaStrategy(self, numaStrategy: str): return self._set(numaStrategy=numaStrategy) def setRopeScalingType(self, ropeScalingType: str): - """Set the RoPE frequency scaling method, defaults to linear unless specified by the model""" - return self._set(ropeScalingType=ropeScalingType) + """Set the RoPE frequency scaling method, defaults to linear unless specified by the model. + + Possible values: + + - NONE: Don't use any scaling + - LINEAR: Linear scaling + - YARN: YaRN RoPE scaling + """ + ropeScalingTypeUpper = ropeScalingType.upper() + ropeScalingTypes = ["NONE", "LINEAR", "YARN"] + if ropeScalingTypeUpper not in ropeScalingTypes: + raise ValueError( + f"Invalid RoPE scaling type: {ropeScalingType}. " + + f"Valid values are: {ropeScalingTypes}" + ) + return self._set(ropeScalingType=ropeScalingTypeUpper) def setPoolingType(self, poolingType: str): - """Set the pooling type for embeddings, use model default if unspecified""" + """Set the pooling type for embeddings, use model default if unspecified + + Possible values: + + - 0 NONE: Don't use any pooling + - 1 MEAN: Mean Pooling + - 2 CLS: CLS Pooling + - 3 LAST: Last token pooling + - 4 RANK: For reranked models + """ poolingTypeUpper = poolingType.upper() - poolingTypes = ["NONE", "MEAN", "CLS", "LAST"] + poolingTypes = ["NONE", "MEAN", "CLS", "LAST", "RANK"] if poolingTypeUpper not in poolingTypes: raise ValueError( f"Invalid pooling type: {poolingType}. " @@ -1082,13 +1116,13 @@ def setModelAlias(self, modelAlias: str): """Set a model alias""" return self._set(modelAlias=modelAlias) - def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str): - """Set path to static lookup cache to use for lookup decoding (not updated by generation)""" - return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath) + # def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str): + # """Set path to static lookup cache to use for lookup decoding (not updated by generation)""" + # return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath) - def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str): - """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)""" - return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath) + # def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str): + # """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)""" + # return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath) def setEmbedding(self, embedding: bool): """Whether to load model with embedding support""" @@ -1098,9 +1132,9 @@ def setFlashAttention(self, flashAttention: bool): """Whether to enable Flash Attention""" return self._set(flashAttention=flashAttention) - def setInputPrefixBos(self, inputPrefixBos: bool): - """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string""" - return self._set(inputPrefixBos=inputPrefixBos) + # def setInputPrefixBos(self, inputPrefixBos: bool): + # """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string""" + # return self._set(inputPrefixBos=inputPrefixBos) def setUseMmap(self, useMmap: bool): """Whether to use memory-map model (faster load but may increase pageouts if not using mlock)""" @@ -1260,9 +1294,9 @@ 
def setTokenBias(self, tokenBias: Dict[str, float]): """Set token id bias""" return self._call_java("setTokenBias", tokenBias) - def setLoraAdapters(self, loraAdapters: Dict[str, float]): - """Set LoRA adapters with their scaling factors""" - return self._call_java("setLoraAdapters", loraAdapters) + # def setLoraAdapters(self, loraAdapters: Dict[str, float]): + # """Set LoRA adapters with their scaling factors""" + # return self._call_java("setLoraAdapters", loraAdapters) def getMetadata(self): """Gets the metadata of the model""" diff --git a/python/test/annotator/embeddings/auto_gguf_embeddings_test.py b/python/test/annotator/embeddings/auto_gguf_embeddings_test.py index 0f9ebe0b4ea247..a8c7fa01838f37 100644 --- a/python/test/annotator/embeddings/auto_gguf_embeddings_test.py +++ b/python/test/annotator/embeddings/auto_gguf_embeddings_test.py @@ -153,8 +153,8 @@ def runTest(self): .setInputCols("document") .setOutputCol("embeddings") .setBatchSize(4) - .setNUbatch(2048) - .setNBatch(2048) + .setNUbatch(4096) + .setNBatch(4096) ) pipeline = Pipeline().setStages([self.document_assembler, model]) results = pipeline.fit(self.long_data).transform(self.long_data) diff --git a/python/test/annotator/seq2seq/auto_gguf_model_test.py b/python/test/annotator/seq2seq/auto_gguf_model_test.py index cb014591ae33bc..e34c3a956ff2bb 100644 --- a/python/test/annotator/seq2seq/auto_gguf_model_test.py +++ b/python/test/annotator/seq2seq/auto_gguf_model_test.py @@ -49,7 +49,7 @@ def runTest(self): .setOutputCol("completions") .setBatchSize(4) .setNPredict(20) - .setNGpuLayers(5) + .setNGpuLayers(99) .setTemperature(0.4) .setTopK(40) .setTopP(0.9) @@ -78,7 +78,7 @@ def runTest(self): DocumentAssembler().setInputCol("text").setOutputCol("document") ) - model = ( + model: AutoGGUFModel = ( AutoGGUFModel.pretrained() .setInputCols("document") .setOutputCol("completions") @@ -87,23 +87,23 @@ def runTest(self): # Model Parameters model.setNThreads(8) - model.setNThreadsDraft(8) + # model.setNThreadsDraft(8) model.setNThreadsBatch(8) - model.setNThreadsBatchDraft(8) + # model.setNThreadsBatchDraft(8) model.setNCtx(512) model.setNBatch(32) model.setNUbatch(32) model.setNDraft(5) - model.setNChunks(-1) - model.setNSequences(1) - model.setPSplit(0.1) + # model.setNChunks(-1) + # model.setNSequences(1) + # model.setPSplit(0.1) model.setNGpuLayers(99) model.setNGpuLayersDraft(99) model.setGpuSplitMode("NONE") model.setMainGpu(0) - model.setTensorSplit([]) - model.setGrpAttnN(1) - model.setGrpAttnW(512) + # model.setTensorSplit([]) + # model.setGrpAttnN(1) + # model.setGrpAttnW(512) model.setRopeFreqBase(1.0) model.setRopeFreqScale(1.0) model.setYarnExtFactor(1.0) @@ -113,14 +113,14 @@ def runTest(self): model.setYarnOrigCtx(0) model.setDefragmentationThreshold(-1.0) model.setNumaStrategy("DISTRIBUTE") - model.setRopeScalingType("UNSPECIFIED") + model.setRopeScalingType("NONE") model.setPoolingType("NONE") model.setModelDraft("") - model.setLookupCacheStaticFilePath("/tmp/sparknlp-llama-cpp-cache") - model.setLookupCacheDynamicFilePath("/tmp/sparknlp-llama-cpp-cache") + # model.setLookupCacheStaticFilePath("/tmp/sparknlp-llama-cpp-cache") + # model.setLookupCacheDynamicFilePath("/tmp/sparknlp-llama-cpp-cache") model.setEmbedding(False) model.setFlashAttention(False) - model.setInputPrefixBos(False) + # model.setInputPrefixBos(False) model.setUseMmap(False) model.setUseMlock(False) model.setNoKvOffload(False) @@ -164,7 +164,7 @@ def runTest(self): # Special PySpark Parameters (Scala StructFeatures) model.setTokenIdBias({0: 
0.0, 1: 0.0}) model.setTokenBias({"!": 0.0, "?": 0.0}) - model.setLoraAdapters({" ": 0.0}) + # model.setLoraAdapters({" ": 0.0}) pipeline = Pipeline().setStages([document_assembler, model]) results = pipeline.fit(data).transform(data) diff --git a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala index 6f68ead3a51ef0..1d65a8daa567d6 100644 --- a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala +++ b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala @@ -15,8 +15,8 @@ */ package com.johnsnowlabs.ml.gguf -import com.johnsnowlabs.nlp.llama.{LlamaModel, ModelParameters} import com.johnsnowlabs.nlp.util.io.ResourceHelper +import de.kherud.llama.{LlamaModel, ModelParameters} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkFiles import org.apache.spark.sql.SparkSession @@ -42,7 +42,7 @@ class GGUFWrapper(var modelFileName: String, var modelFolder: String) extends Se val modelFilePath = SparkFiles.get(modelFileName) if (Paths.get(modelFilePath).toFile.exists()) { - modelParameters.setModelFilePath(modelFilePath) + modelParameters.setModel(modelFilePath) llamaModel = GGUFWrapper.withSafeGGUFModelLoader(modelParameters) } else throw new IllegalStateException( diff --git a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapperMultiModal.scala b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapperMultiModal.scala index 89eb8f517360f2..4f8fef32dd0904 100644 --- a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapperMultiModal.scala +++ b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapperMultiModal.scala @@ -15,7 +15,7 @@ */ package com.johnsnowlabs.ml.gguf -import com.johnsnowlabs.nlp.llama.{LlamaModel, ModelParameters} +import de.kherud.llama.{LlamaModel, ModelParameters} import com.johnsnowlabs.nlp.util.io.ResourceHelper import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkFiles @@ -44,8 +44,8 @@ class GGUFWrapperMultiModal(var modelFileName: String, var mmprojFileName: Strin Paths.get(modelFilePath).toFile.exists() && Paths.get(mmprojFilePath).toFile.exists() if (filesExist) { - modelParameters.setModelFilePath(modelFilePath) - modelParameters.setMMProj(mmprojFilePath) + modelParameters.setModel(modelFilePath) +// modelParameters.setMMProj(mmprojFilePath) // TODO: Vision models implementation llamaModel = GGUFWrapperMultiModal.withSafeGGUFModelLoader(modelParameters) } else throw new IllegalStateException( diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppInferenceProperties.scala b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppInferenceProperties.scala index e200610b38a2a9..fcc797ddbaf417 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppInferenceProperties.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppInferenceProperties.scala @@ -1,8 +1,8 @@ package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel -import com.johnsnowlabs.nlp.llama.InferenceParameters -import com.johnsnowlabs.nlp.llama.args._ +import de.kherud.llama.InferenceParameters +import de.kherud.llama.args._ import com.johnsnowlabs.nlp.serialization.StructFeature import org.apache.spark.ml.param._ diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppModelProperties.scala b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppModelProperties.scala index e71a7b999f25c2..2c4ddee89320c1 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppModelProperties.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppModelProperties.scala @@ 
-1,18 +1,13 @@ package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel -import com.johnsnowlabs.nlp.llama.ModelParameters -import com.johnsnowlabs.nlp.llama.args.{GpuSplitMode, NumaStrategy, PoolingType, RopeScalingType} -import com.johnsnowlabs.nlp.serialization.StructFeature +import de.kherud.llama.ModelParameters +import de.kherud.llama.args.{GpuSplitMode, NumaStrategy, PoolingType, RopeScalingType} import org.apache.spark.ml.param._ -import org.apache.spark.sql.SparkSession import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods import org.slf4j.LoggerFactory -import scala.collection.mutable -import scala.jdk.CollectionConverters._ - /** Contains settable model parameters for the [[AutoGGUFModel]]. * * @groupname param Parameters @@ -34,10 +29,10 @@ trait HasLlamaCppModelProperties { new IntParam(this, "nThreads", "Set the number of threads to use during generation") /** @group param */ - val nThreadsDraft = new IntParam( - this, - "nThreadsDraft", - "Set the number of threads to use during draft generation") +// val nThreadsDraft = new IntParam( +// this, +// "nThreadsDraft", +// "Set the number of threads to use during draft generation") /** @group param */ val nThreadsBatch = new IntParam( @@ -46,10 +41,10 @@ trait HasLlamaCppModelProperties { "Set the number of threads to use during batch and prompt processing") /** @group param */ - val nThreadsBatchDraft = new IntParam( - this, - "nThreadsBatchDraft", - "Set the number of threads to use during batch and prompt processing") +// val nThreadsBatchDraft = new IntParam( +// this, +// "nThreadsBatchDraft", +// "Set the number of threads to use during batch and prompt processing") /** @group param */ val nCtx = new IntParam(this, "nCtx", "Set the size of the prompt context") @@ -71,14 +66,14 @@ trait HasLlamaCppModelProperties { new IntParam(this, "nDraft", "Set the number of tokens to draft for speculative decoding") /** @group param */ - val nChunks = new IntParam(this, "nChunks", "Set the maximal number of chunks to process") +// val nChunks = new IntParam(this, "nChunks", "Set the maximal number of chunks to process") /** @group param */ - val nSequences = - new IntParam(this, "nSequences", "Set the number of sequences to decode") +// val nSequences = +// new IntParam(this, "nSequences", "Set the number of sequences to decode") /** @group param */ - val pSplit = new FloatParam(this, "pSplit", "Set the speculative decoding split probability") +// val pSplit = new FloatParam(this, "pSplit", "Set the speculative decoding split probability") /** @group param */ val nGpuLayers = new IntParam( @@ -108,16 +103,16 @@ trait HasLlamaCppModelProperties { new IntParam(this, "mainGpu", "Set the main GPU that is used for scratch and small tensors.") /** @group param */ - val tensorSplit = new DoubleArrayParam( - this, - "tensorSplit", - "Set how split tensors should be distributed across GPUs") +// val tensorSplit = new DoubleArrayParam( +// this, +// "tensorSplit", +// "Set how split tensors should be distributed across GPUs") // TODO /** @group param */ - val grpAttnN = new IntParam(this, "grpAttnN", "Set the group-attention factor") +// val grpAttnN = new IntParam(this, "grpAttnN", "Set the group-attention factor") /** @group param */ - val grpAttnW = new IntParam(this, "grpAttnW", "Set the group-attention width") +// val grpAttnW = new IntParam(this, "grpAttnW", "Set the group-attention width") /** @group param */ val ropeFreqBase = @@ -202,19 +197,19 @@ trait HasLlamaCppModelProperties { new 
Param[String](this, "modelDraft", "Set the draft model for speculative decoding") /** @group param */ - val lookupCacheStaticFilePath = new Param[String]( - this, - "lookupCacheStaticFilePath", - "Set path to static lookup cache to use for lookup decoding (not updated by generation)") +// val lookupCacheStaticFilePath = new Param[String]( +// this, +// "lookupCacheStaticFilePath", +// "Set path to static lookup cache to use for lookup decoding (not updated by generation)") - /** @group param */ - val lookupCacheDynamicFilePath = new Param[String]( - this, - "lookupCacheDynamicFilePath", - "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)") +// /** @group param */ +// val lookupCacheDynamicFilePath = new Param[String]( +// this, +// "lookupCacheDynamicFilePath", +// "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)") /** @group param */ - val loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters") +// val loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters") /** @group param */ val embedding = @@ -224,11 +219,11 @@ trait HasLlamaCppModelProperties { val flashAttention = new BooleanParam(this, "flashAttention", "Whether to enable Flash Attention") - /** @group param */ - val inputPrefixBos = new BooleanParam( - this, - "inputPrefixBos", - "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string") +// /** @group param */ +// val inputPrefixBos = new BooleanParam( +// this, +// "inputPrefixBos", +// "This parameter is deprecated and will have not effect.") /** @group param */ val useMmap = new BooleanParam( @@ -272,9 +267,9 @@ trait HasLlamaCppModelProperties { * * @group setParam */ - def setNThreadsDraft(nThreadsDraft: Int): this.type = { - checkEmbeddingMode { set(this.nThreadsDraft, nThreadsDraft) } - } +// def setNThreadsDraft(nThreadsDraft: Int): this.type = { +// checkEmbeddingMode { set(this.nThreadsDraft, nThreadsDraft) } +// } /** Set the number of threads to use during batch and prompt processing * @@ -288,9 +283,9 @@ trait HasLlamaCppModelProperties { * * @group setParam */ - def setNThreadsBatchDraft(nThreadsBatchDraft: Int): this.type = { - checkEmbeddingMode { set(this.nThreadsBatchDraft, nThreadsBatchDraft) } - } +// def setNThreadsBatchDraft(nThreadsBatchDraft: Int): this.type = { +// checkEmbeddingMode { set(this.nThreadsBatchDraft, nThreadsBatchDraft) } +// } /** Set the size of the prompt context * @@ -328,25 +323,25 @@ trait HasLlamaCppModelProperties { * * @group setParam */ - def setNChunks(nChunks: Int): this.type = { - set(this.nChunks, nChunks) - } +// def setNChunks(nChunks: Int): this.type = { +// set(this.nChunks, nChunks) +// } /** Set the number of sequences to decode * * @group setParam */ - def setNSequences(nSequences: Int): this.type = { - set(this.nSequences, nSequences) - } +// def setNSequences(nSequences: Int): this.type = { +// set(this.nSequences, nSequences) +// } /** Set the speculative decoding split probability * * @group setParam */ - def setPSplit(pSplit: Float): this.type = { - checkEmbeddingMode { set(this.pSplit, pSplit) } - } +// def setPSplit(pSplit: Float): this.type = { +// checkEmbeddingMode { set(this.pSplit, pSplit) } +// } /** Set the number of layers to store in VRAM (-1 - use default) * @@ -387,25 +382,25 @@ trait HasLlamaCppModelProperties { * * @group setParam */ - def setTensorSplit(tensorSplit: Array[Double]): this.type = { - set(this.tensorSplit, tensorSplit) - } +// def setTensorSplit(tensorSplit: 
Array[Double]): this.type = { +// set(this.tensorSplit, tensorSplit) +// } /** Set the group-attention factor * * @group setParam */ - def setGrpAttnN(grpAttnN: Int): this.type = { - set(this.grpAttnN, grpAttnN) - } +// def setGrpAttnN(grpAttnN: Int): this.type = { +// set(this.grpAttnN, grpAttnN) +// } /** Set the group-attention width * * @group setParam */ - def setGrpAttnW(grpAttnW: Int): this.type = { - set(this.grpAttnW, grpAttnW) - } +// def setGrpAttnW(grpAttnW: Int): this.type = { +// set(this.grpAttnW, grpAttnW) +// } /** Set the RoPE base frequency, used by NTK-aware scaling * @@ -488,38 +483,47 @@ trait HasLlamaCppModelProperties { val numaStrategies = Array("DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR") require( numaStrategies.contains(numaUpper), - s"Invalid NUMA strategy: $numa. " + + s"Invalid NUMA strategy: $numaUpper. " + s"Valid values are: ${numaStrategies.mkString(", ")}") set(this.numaStrategy, numaUpper) } /** Set the RoPE frequency scaling method, defaults to linear unless specified by the model. * - * - UNSPECIFIED: Don't use any scaling + * - NONE: Don't use any scaling * - LINEAR: Linear scaling * - YARN: YaRN RoPE scaling * * @group setParam */ def setRopeScalingType(ropeScalingType: String): this.type = { - set(this.ropeScalingType, ropeScalingType) + val ropeUpper = ropeScalingType.toUpperCase + val ropeScalingTypes = Array("NONE", "LINEAR", "YARN") + require( + ropeScalingTypes.contains(ropeUpper), + s"Invalid RoPE scaling type: $ropeUpper. " + + s"Valid values are: ${ropeScalingTypes.mkString(", ")}") + set(this.ropeScalingType, ropeUpper) } - /** Set the pooling type for embeddings, use model default if unspecified + /** Set the pooling type for embeddings, use model default if unspecified. * - * - 0 NONE: Don't use any pooling and return token embeddings (if the model supports it) - * - 1 MEAN: Mean Pooling - * - 2 CLS: Choose the CLS token - * - 3 LAST: Choose the last token + * Possible values: + * + * - NONE: No pooling + * - MEAN: Mean pooling + * - CLS: Choose the CLS token + * - LAST: Choose the last token + * - RANK: For reranking * * @group setParam */ def setPoolingType(poolingType: String): this.type = { val poolingTypeUpper = poolingType.toUpperCase - val poolingTypes = Array("NONE", "MEAN", "CLS", "LAST") + val poolingTypes = Array("NONE", "MEAN", "CLS", "LAST", "RANK") require( poolingTypes.contains(poolingTypeUpper), - s"Invalid pooling type: $poolingType. " + + s"Invalid pooling type: $poolingTypeUpper. 
" + s"Valid values are: ${poolingTypes.mkString(", ")}") set(this.poolingType, poolingTypeUpper) } @@ -536,34 +540,34 @@ trait HasLlamaCppModelProperties { * * @group setParam */ - def setLookupCacheStaticFilePath(lookupCacheStaticFilePath: String): this.type = { - checkEmbeddingMode { set(this.lookupCacheStaticFilePath, lookupCacheStaticFilePath) } - } - - /** Set path to dynamic lookup cache to use for lookup decoding (updated by generation) - * - * @group setParam - */ - def setLookupCacheDynamicFilePath(lookupCacheDynamicFilePath: String): this.type = { - checkEmbeddingMode { set(this.lookupCacheDynamicFilePath, lookupCacheDynamicFilePath) } - } +// def setLookupCacheStaticFilePath(lookupCacheStaticFilePath: String): this.type = { +// checkEmbeddingMode { set(this.lookupCacheStaticFilePath, lookupCacheStaticFilePath) } +// } +// /** Set path to dynamic lookup cache to use for lookup decoding (updated by generation) +// * +// * @group setParam +// */ +// def setLookupCacheDynamicFilePath(lookupCacheDynamicFilePath: String): this.type = { +// checkEmbeddingMode { set(this.lookupCacheDynamicFilePath, lookupCacheDynamicFilePath) } +// } +// /** Sets paths to lora adapters with user defined scale. * * @group setParam */ - def setLoraAdapters(loraAdapters: Map[String, Float]): this.type = { - set(this.loraAdapters, loraAdapters) - } +// def setLoraAdapters(loraAdapters: Map[String, Float]): this.type = { +// set(this.loraAdapters, loraAdapters) +// } /** Sets paths to lora adapters with user defined scale. (PySpark Override) * * @group setParam */ - def setLoraAdapters(loraAdapters: java.util.HashMap[String, java.lang.Double]): this.type = { - val scalaLoraAdapters = loraAdapters.asScala.map { case (k, v) => k -> v.floatValue() } - set(this.loraAdapters, scalaLoraAdapters.toMap) - } +// def setLoraAdapters(loraAdapters: java.util.HashMap[String, java.lang.Double]): this.type = { +// val scalaLoraAdapters = loraAdapters.asScala.map { case (k, v) => k -> v.floatValue() } +// set(this.loraAdapters, scalaLoraAdapters.toMap) +// } /** Whether to load model with embedding support * @@ -581,13 +585,13 @@ trait HasLlamaCppModelProperties { set(this.flashAttention, flashAttention) } - /** Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string - * - * @group setParam - */ - def setInputPrefixBos(inputPrefixBos: Boolean): this.type = { - set(this.inputPrefixBos, inputPrefixBos) - } +// /** Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string +// * +// * @group setParam +// */ +// def setInputPrefixBos(inputPrefixBos: Boolean): this.type = { +// set(this.inputPrefixBos, inputPrefixBos) +// } /** Whether to use memory-map model (faster load but may increase pageouts if not using mlock) * @@ -633,13 +637,13 @@ trait HasLlamaCppModelProperties { def getNThreads: Int = $(nThreads) /** @group getParam */ - def getNThreadsDraft: Int = $(nThreadsDraft) +// def getNThreadsDraft: Int = $(nThreadsDraft) /** @group getParam */ def getNThreadsBatch: Int = $(nThreadsBatch) /** @group getParam */ - def getNThreadsBatchDraft: Int = $(nThreadsBatchDraft) +// def getNThreadsBatchDraft: Int = $(nThreadsBatchDraft) /** @group getParam */ def getNCtx: Int = $(nCtx) @@ -654,13 +658,13 @@ trait HasLlamaCppModelProperties { def getNDraft: Int = $(nDraft) /** @group getParam */ - def getNChunks: Int = $(nChunks) +// def getNChunks: Int = $(nChunks) /** @group getParam */ - def getNSequences: Int = $(nSequences) +// def getNSequences: Int = $(nSequences) /** @group getParam */ - def 
getPSplit: Float = $(pSplit) +// def getPSplit: Float = $(pSplit) /** @group getParam */ def getNGpuLayers: Int = $(nGpuLayers) @@ -675,12 +679,12 @@ trait HasLlamaCppModelProperties { def getMainGpu: Int = $(mainGpu) /** @group getParam */ - def getTensorSplit: Array[Double] = $(tensorSplit) +// def getTensorSplit: Array[Double] = $(tensorSplit) - def getGrpAttnN: Int = $(grpAttnN) +// def getGrpAttnN: Int = $(grpAttnN) /** @group getParam */ - def getGrpAttnW: Int = $(grpAttnW) +// def getGrpAttnW: Int = $(grpAttnW) /** @group getParam */ def getRopeFreqBase: Float = $(ropeFreqBase) @@ -719,13 +723,13 @@ trait HasLlamaCppModelProperties { def getModelDraft: String = $(modelDraft) /** @group getParam */ - def getLookupCacheStaticFilePath: String = $(lookupCacheStaticFilePath) +// def getLookupCacheStaticFilePath: String = $(lookupCacheStaticFilePath) /** @group getParam */ - def getLookupCacheDynamicFilePath: String = $(lookupCacheDynamicFilePath) +// def getLookupCacheDynamicFilePath: String = $(lookupCacheDynamicFilePath) /** @group getParam */ - def getLoraAdapters: Map[String, Float] = $$(loraAdapters) +// def getLoraAdapters: Map[String, Float] = $$(loraAdapters) /** @group getParam */ def getEmbedding: Boolean = $(embedding) @@ -733,8 +737,8 @@ trait HasLlamaCppModelProperties { /** @group getParam */ def getFlashAttention: Boolean = $(flashAttention) - /** @group getParam */ - def getInputPrefixBos: Boolean = $(inputPrefixBos) +// /** @group getParam */ +// def getInputPrefixBos: Boolean = $(inputPrefixBos) /** @group getParam */ def getUseMmap: Boolean = $(useMmap) @@ -765,89 +769,90 @@ trait HasLlamaCppModelProperties { */ def getMetadata: String = $(metadata) - def getMetadataMap: Map[String, String] = { + def getMetadataMap: Map[String, Map[String, String]] = { val metadataJsonString = getMetadata if (metadataJsonString.isEmpty) Map.empty else { implicit val formats: DefaultFormats.type = DefaultFormats - JsonMethods.parse(metadataJsonString).extract[Map[String, String]] + JsonMethods.parse(metadataJsonString).extract[Map[String, Map[String, String]]] } } protected def getModelParameters: ModelParameters = { - val modelParameters = new ModelParameters().setContinuousBatching(true) // Always enabled + val modelParameters = new ModelParameters().enableContBatching() // Always enabled + // TODO: rename params? 
and check which ones are still missing if (isDefined(chatTemplate)) modelParameters.setChatTemplate(getChatTemplate) if (isDefined(defragmentationThreshold)) - modelParameters.setDefragmentationThreshold(getDefragmentationThreshold) - if (isDefined(embedding)) modelParameters.setEmbedding(getEmbedding) - if (isDefined(flashAttention)) modelParameters.setFlashAttention(getFlashAttention) + modelParameters.setDefragThold(getDefragmentationThreshold) + if (isDefined(embedding)) if (getEmbedding) modelParameters.enableEmbedding() + if (isDefined(flashAttention)) if (getFlashAttention) modelParameters.enableFlashAttn() if (isDefined(gpuSplitMode)) modelParameters.setSplitMode(GpuSplitMode.valueOf(getSplitMode)) - if (isDefined(grpAttnN)) modelParameters.setGrpAttnN(getGrpAttnN) - if (isDefined(grpAttnW)) modelParameters.setGrpAttnN(getGrpAttnW) - if (isDefined(inputPrefixBos)) modelParameters.setInputPrefixBos(getInputPrefixBos) - if (isDefined(lookupCacheDynamicFilePath)) - modelParameters.setLookupCacheDynamicFilePath(getLookupCacheDynamicFilePath) - if (isDefined(lookupCacheStaticFilePath)) - modelParameters.setLookupCacheStaticFilePath(getLookupCacheStaticFilePath) +// if (isDefined(grpAttnN)) modelParameters.setGrpAttnN(getGrpAttnN) +// if (isDefined(grpAttnW)) modelParameters.setGrpAttnN(getGrpAttnW) +// if (isDefined(inputPrefixBos)) modelParameters.setInputPrefixBos(getInputPrefixBos) +// if (isDefined(lookupCacheDynamicFilePath)) +// modelParameters.setLookupCacheDynamicFilePath(getLookupCacheDynamicFilePath) +// if (isDefined(lookupCacheStaticFilePath)) +// modelParameters.setLookupCacheStaticFilePath(getLookupCacheStaticFilePath) if (isDefined(mainGpu)) modelParameters.setMainGpu(getMainGpu) if (isDefined(modelDraft)) modelParameters.setModelDraft(getModelDraft) - if (isDefined(nBatch)) modelParameters.setNBatch(getNBatch) - if (isDefined(nChunks)) modelParameters.setNChunks(getNChunks) - if (isDefined(nCtx)) modelParameters.setNCtx(getNCtx) - if (isDefined(nDraft)) modelParameters.setNDraft(getNDraft) - if (isDefined(nGpuLayers)) modelParameters.setNGpuLayers(getNGpuLayers) - if (isDefined(nGpuLayersDraft)) modelParameters.setNGpuLayersDraft(getNGpuLayersDraft) - if (isDefined(nSequences)) modelParameters.setNSequences(getNSequences) - if (isDefined(nThreads)) modelParameters.setNThreads(getNThreads) - if (isDefined(nThreadsBatch)) modelParameters.setNThreadsBatch(getNThreadsBatch) - if (isDefined(nThreadsBatchDraft)) - modelParameters.setNThreadsBatchDraft(getNThreadsBatchDraft) - if (isDefined(nThreadsDraft)) modelParameters.setNThreadsDraft(getNThreadsDraft) - if (isDefined(nUbatch)) modelParameters.setNUbatch(getNUbatch) - if (isDefined(noKvOffload)) modelParameters.setNoKvOffload(getNoKvOffload) - if (isDefined(numaStrategy)) modelParameters.setNuma(NumaStrategy.valueOf(getNuma)) - if (isDefined(pSplit)) modelParameters.setPSplit(getPSplit) + if (isDefined(nBatch)) modelParameters.setBatchSize(getNBatch) +// if (isDefined(nChunks)) modelParameters.setNChunks(getNChunks) + if (isDefined(nCtx)) modelParameters.setCtxSize(getNCtx) + if (isDefined(nDraft)) modelParameters.setCtxSizeDraft(getNDraft) + if (isDefined(nGpuLayers)) modelParameters.setGpuLayers(getNGpuLayers) + if (isDefined(nGpuLayersDraft)) modelParameters.setGpuLayersDraft(getNGpuLayersDraft) +// if (isDefined(nSequences)) modelParameters.setNSequencis(getNSequences) + if (isDefined(nThreads)) modelParameters.setThreads(getNThreads) + if (isDefined(nThreadsBatch)) modelParameters.setThreadsBatch(getNThreadsBatch) +// 
if (isDefined(nThreadsBatchDraft)) +// modelParameters.setTh(getNThreadsBatchDraft) +// if (isDefined(nThreadsDraft)) modelParameters.setNThreadsDraft(getNThreadsDraft) + if (isDefined(nUbatch)) modelParameters.setUbatchSize(getNUbatch) + if (isDefined(noKvOffload)) if (getNoKvOffload) modelParameters.disableKvOffload() + if (isDefined(numaStrategy)) + modelParameters.setNuma(NumaStrategy.valueOf(getNuma)) +// if (isDefined(pSplit)) modelParameters.setPSplit(getPSplit) if (isDefined(poolingType)) modelParameters.setPoolingType(PoolingType.valueOf(getPoolingType)) if (isDefined(ropeFreqBase)) modelParameters.setRopeFreqBase(getRopeFreqBase) if (isDefined(ropeFreqScale)) modelParameters.setRopeFreqScale(getRopeFreqScale) if (isDefined(ropeScalingType)) - modelParameters.setRopeScalingType(RopeScalingType.valueOf(getRopeScalingType)) - if (isDefined(systemPrompt)) modelParameters.setSystemPrompt(getSystemPrompt) - if (isDefined(tensorSplit)) modelParameters.setTensorSplit(getTensorSplit.map(_.toFloat)) - if (isDefined(useMlock)) modelParameters.setUseMlock(getUseMlock) - if (isDefined(useMmap)) modelParameters.setUseMmap(getUseMmap) + modelParameters.setRopeScaling(RopeScalingType.valueOf(getRopeScalingType)) + // if (isDefined(tensorSplit)) modelParameters.setTensorSplit(getTensorSplit.map(_.toFloat)) + if (isDefined(useMlock)) if (getUseMlock) modelParameters.enableMlock + if (isDefined(useMmap)) if (!getUseMmap) modelParameters.disableMmap if (isDefined(yarnAttnFactor)) modelParameters.setYarnAttnFactor(getYarnAttnFactor) if (isDefined(yarnBetaFast)) modelParameters.setYarnBetaFast(getYarnBetaFast) if (isDefined(yarnBetaSlow)) modelParameters.setYarnBetaSlow(getYarnBetaSlow) if (isDefined(yarnExtFactor)) modelParameters.setYarnExtFactor(getYarnExtFactor) if (isDefined(yarnOrigCtx)) modelParameters.setYarnOrigCtx(getYarnOrigCtx) - if (loraAdapters.isSet) { - val loraAdaptersMap: mutable.Map[String, java.lang.Float] = - mutable.Map(getLoraAdapters.map { case (key, value) => - (key, float2Float(value)) - }.toSeq: _*) - modelParameters.setLoraAdapters(loraAdaptersMap.asJava) - } // Need to convert to mutable map first +// if (loraAdapters.isSet) { +// val loraAdaptersMap: mutable.Map[String, java.lang.Float] = +// mutable.Map(getLoraAdapters.map { case (key, value) => +// (key, float2Float(value)) +// }.toSeq: _*) +// modelParameters.addLoraAdapter(loraAdaptersMap.asJava) +// } // Need to convert to mutable map first modelParameters } // ---------------- GPU SUPPORT ---------------- // Values for automatic GPU support - protected val defaultGpuLayers = 1000 - protected val defaultMainGpu = 0 - - // Entrypoint for models. Automatically set GPU support if detected. - protected def setGpuSupportIfAvailable(spark: SparkSession): this.type = { - val usingGPUJar: Boolean = spark.sparkContext.listJars.exists(_.contains("spark-nlp-gpu")) - if (usingGPUJar) { - logger.info("Using GPU jar. Offloading all layers to GPU.") - setMainGpu(defaultMainGpu) - setNGpuLayers(defaultGpuLayers) - } - this - } +// protected val defaultGpuLayers = 1000 +// protected val defaultMainGpu = 0 +// +// // Entrypoint for models. Automatically set GPU support if detected. +// protected def setGpuSupportIfAvailable(spark: SparkSession): this.type = { +// val usingGPUJar: Boolean = spark.sparkContext.listJars.exists(_.contains("spark-nlp-gpu")) +// if (usingGPUJar) { +// logger.info("Using GPU jar. 
Offloading all layers to GPU.") +// setMainGpu(defaultMainGpu) +// setNGpuLayers(defaultGpuLayers) +// } +// this +// } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/PromptAssembler.scala b/src/main/scala/com/johnsnowlabs/nlp/PromptAssembler.scala index 3a58132965071d..c2774833c84126 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/PromptAssembler.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/PromptAssembler.scala @@ -1,7 +1,7 @@ package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT -import com.johnsnowlabs.nlp.llama.LlamaModel +import com.johnsnowlabs.nlp.llama.LlamaExtensions import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} @@ -229,7 +229,7 @@ class PromptAssembler(override val uid: String) Array(role, text) }.toArray - val chatString = LlamaModel.applyChatTemplate(template, chatArray, $(addAssistant)) + val chatString = LlamaExtensions.applyChatTemplate(template, chatArray, $(addAssistant)) Seq(Annotation(chatString)) } catch { case _: Exception => diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala index 4be4c98039058f..970e04c9673188 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala @@ -18,8 +18,9 @@ package com.johnsnowlabs.nlp.annotators.seq2seq import com.johnsnowlabs.ml.gguf.GGUFWrapper import com.johnsnowlabs.ml.util.LlamaCPP import com.johnsnowlabs.nlp._ -import com.johnsnowlabs.nlp.llama.LlamaModel +import com.johnsnowlabs.nlp.llama.LlamaExtensions import com.johnsnowlabs.nlp.util.io.ResourceHelper +import de.kherud.llama.{InferenceParameters, LlamaException, LlamaModel} import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.SparkSession @@ -138,9 +139,7 @@ class AutoGGUFModel(override val uid: String) if (_model.isEmpty) { _model = Some(spark.sparkContext.broadcast(wrapper)) } - - // Entrypoint for models. Automatically set GPU support if detected. - setGpuSupportIfAvailable(spark) + this } private[johnsnowlabs] def setEngine(engineName: String): this.type = set(engine, engineName) @@ -150,8 +149,10 @@ class AutoGGUFModel(override val uid: String) useChatTemplate -> true, nCtx -> 4096, nBatch -> 512, - embedding -> false, - nPredict -> 100) + embedding -> false, // TODO: Disable this? + nPredict -> 100, + nGpuLayers -> 99, + systemPrompt -> "You are a helpful assistant.") /** Sets the number of parallel processes for decoding. This is an alias for `setBatchSize`. 
* @@ -177,12 +178,13 @@ class AutoGGUFModel(override val uid: String) */ override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = { val annotations: Seq[Annotation] = batchedAnnotations.flatten + // TODO: group by doc and sentence if (annotations.nonEmpty) { - val annotationsText = annotations.map(_.result) + val annotationsText = annotations.map { anno => anno.result } val modelParams = - getModelParameters.setNParallel(getBatchSize) // set parallel decoding to batch size - val inferenceParams = getInferenceParameters + getModelParameters.setParallel(getBatchSize) // set parallel decoding to batch size + val inferenceParams: InferenceParameters = getInferenceParameters val model: LlamaModel = getModelIfNotSet.getSession(modelParams) @@ -190,9 +192,9 @@ class AutoGGUFModel(override val uid: String) // Return embeddings in annotation val (embeddings: Array[Array[Float]], metadata: Map[String, String]) = try { - (model.requestBatchEmbeddings(annotationsText.toArray), Map.empty) + (annotationsText.map(model.embed), Map.empty) } catch { - case e: Exception => + case e: LlamaException => logger.error("Error in llama.cpp embeddings", e) ( Array.fill[Array[Float]](annotationsText.length)(Array.empty), @@ -212,9 +214,12 @@ class AutoGGUFModel(override val uid: String) } else { val (completedTexts: Array[String], metadata: Map[String, String]) = try { - (model.requestBatchCompletion(annotationsText.toArray, inferenceParams), Map.empty) + val results: Array[String] = annotationsText.map { t => + LlamaExtensions.complete(model, inferenceParams, getSystemPrompt, t) + }.toArray + (results, Map.empty) } catch { - case e: Exception => + case e: LlamaException => logger.error("Error in llama.cpp batch completion", e) (Array.fill(annotationsText.length)(""), Map("llamacpp_exception" -> e.getMessage)) } @@ -268,7 +273,7 @@ trait ReadAutoGGUFModel { .setModelIfNotSet(spark, GGUFWrapper.read(spark, localPath)) .setEngine(LlamaCPP.name) - val metadata = LlamaModel.getMetadataFromFile(localPath) + val metadata = LlamaExtensions.getMetadataFromFile(localPath) if (metadata.nonEmpty) annotatorModel.setMetadata(metadata) annotatorModel } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModel.scala index 62b4d4903ec97b..65e3f8371c65c8 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModel.scala @@ -18,8 +18,7 @@ package com.johnsnowlabs.nlp.annotators.seq2seq import com.johnsnowlabs.ml.gguf.GGUFWrapperMultiModal import com.johnsnowlabs.ml.util.LlamaCPP import com.johnsnowlabs.nlp._ -import com.johnsnowlabs.nlp.annotators.cv.util.io.ImageIOUtils -import com.johnsnowlabs.nlp.llama.{LlamaException, LlamaModel} +import com.johnsnowlabs.nlp.llama.LlamaExtensions import com.johnsnowlabs.nlp.util.io.ResourceHelper import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.util.Identifiable @@ -158,6 +157,9 @@ class AutoGGUFVisionModel(override val uid: String) with HasLlamaCppInferenceProperties with HasProtectedParams { + throw new NotImplementedError( + "AutoGGUFVisionModel is not implemented yet for this release. 
Please use the previous Spark NLP release or AutoGGUFModel for text-only tasks.") + override val inputAnnotatorTypes: Array[AnnotatorType] = Array(AnnotatorType.IMAGE, AnnotatorType.DOCUMENT) override val outputAnnotatorType: AnnotatorType = AnnotatorType.DOCUMENT @@ -178,8 +180,6 @@ class AutoGGUFVisionModel(override val uid: String) _model = Some(spark.sparkContext.broadcast(wrapper)) } - // Entrypoint for models. Automatically set GPU support if detected. - setGpuSupportIfAvailable(spark) this } @@ -235,46 +235,47 @@ class AutoGGUFVisionModel(override val uid: String) * sentences that belong to the same original row !! (challenging) */ override def batchAnnotate( - batchedAnnotations: Seq[(Annotation, AnnotationImage)]): Seq[Seq[Annotation]] = { - if (batchedAnnotations.nonEmpty) { - - // set parallel decoding to batch size - val modelParams = getModelParameters.setNParallel(getBatchSize) - val model: LlamaModel = getModelIfNotSet.getSession(modelParams) - - val (prompts, base64EncodedImages) = batchedAnnotations.unzip match { - case (promptAnnotations, imageAnnotations) => - ( - promptAnnotations.map(_.result).toArray, - imageAnnotations - .map(imgAnno => ImageIOUtils.encodeImageBase64(imgAnno.result)) - .toArray) - } - - val (completedTexts: Array[String], metadata: Map[String, String]) = - try { - ( - model.requestBatchImageCompletion( - prompts, - base64EncodedImages, - getInferenceParameters), - Map.empty) - } catch { - case e: LlamaException => - logger.error("Error in llama.cpp image batch completion", e) - (Array.fill(prompts.length)(""), Map("llamacpp_exception" -> e.getMessage)) - } - - val result: Seq[Seq[Annotation]] = - batchedAnnotations.zip(completedTexts).map { - case ((textAnnotation: Annotation, imageAnnotation: AnnotationImage), text) => - val totalMetadata = - textAnnotation.metadata ++ imageAnnotation.metadata ++ metadata - Seq(new Annotation(outputAnnotatorType, 0, text.length - 1, text, totalMetadata)) - } - result - } else Seq(Seq.empty[Annotation]) - } + batchedAnnotations: Seq[(Annotation, AnnotationImage)]): Seq[Seq[Annotation]] = ??? 
+// { +// if (batchedAnnotations.nonEmpty) { +// +// // set parallel decoding to batch size +// val modelParams = getModelParameters.setParallel(getBatchSize) +// val model: LlamaModel = getModelIfNotSet.getSession(modelParams) +// +// val (prompts, base64EncodedImages) = batchedAnnotations.unzip match { +// case (promptAnnotations, imageAnnotations) => +// ( +// promptAnnotations.map(_.result).toArray, +// imageAnnotations +// .map(imgAnno => ImageIOUtils.encodeImageBase64(imgAnno.result)) +// .toArray) +// } +// +// val (completedTexts: Array[String], metadata: Map[String, String]) = +// try { +// ( +// model.requestBatchImageCompletion( +// prompts, +// base64EncodedImages, +// getInferenceParameters), +// Map.empty) +// } catch { +// case e: LlamaException => +// logger.error("Error in llama.cpp image batch completion", e) +// (Array.fill(prompts.length)(""), Map("llamacpp_exception" -> e.getMessage)) +// } +// +// val result: Seq[Seq[Annotation]] = +// batchedAnnotations.zip(completedTexts).map { +// case ((textAnnotation: Annotation, imageAnnotation: AnnotationImage), text) => +// val totalMetadata = +// textAnnotation.metadata ++ imageAnnotation.metadata ++ metadata +// Seq(new Annotation(outputAnnotatorType, 0, text.length - 1, text, totalMetadata)) +// } +// result +// } else Seq(Seq.empty[Annotation]) +// } } trait ReadablePretrainedAutoGGUFVisionModel @@ -322,7 +323,7 @@ trait ReadAutoGGUFVisionModel { .setEngine(LlamaCPP.name) // TODO mmproj metadata necessary? - val metadata = LlamaModel.getMetadataFromFile(localPathModel) + val metadata = LlamaExtensions.getMetadataFromFile(localPathModel) if (metadata.nonEmpty) annotatorModel.setMetadata(metadata) annotatorModel } diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings.scala index 389166a7ad10f6..06ae1053f24a8d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings.scala @@ -18,7 +18,8 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.ml.gguf.GGUFWrapper import com.johnsnowlabs.ml.util.LlamaCPP import com.johnsnowlabs.nlp._ -import com.johnsnowlabs.nlp.llama.LlamaModel +import com.johnsnowlabs.nlp.llama.LlamaExtensions +import de.kherud.llama.LlamaModel import com.johnsnowlabs.nlp.util.io.ResourceHelper import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.util.Identifiable @@ -130,7 +131,8 @@ class AutoGGUFEmbeddings(override val uid: String) _model = Some(spark.sparkContext.broadcast(wrapper)) } - setGpuSupportIfAvailable(spark) + this +// setGpuSupportIfAvailable(spark) } private[johnsnowlabs] def setEngine(engineName: String): this.type = set(engine, engineName) @@ -140,7 +142,8 @@ class AutoGGUFEmbeddings(override val uid: String) embedding -> true, poolingType -> "MEAN", nCtx -> 4096, - nBatch -> 512) + nBatch -> 512, + nGpuLayers -> 99) /** Sets the number of parallel processes for decoding. This is an alias for `setBatchSize`. 
* @@ -172,7 +175,7 @@ class AutoGGUFEmbeddings(override val uid: String) if (annotations.nonEmpty) { val modelParams = - getModelParameters.setNParallel(getBatchSize) // set parallel decoding to batch size + getModelParameters.setParallel(getBatchSize) // set parallel decoding to batch size val model: LlamaModel = getModelIfNotSet.getSession(modelParams) @@ -181,7 +184,8 @@ class AutoGGUFEmbeddings(override val uid: String) // Return embeddings in annotation val (embeddings: Array[Array[Float]], metadata: Map[String, String]) = try { - (model.requestBatchEmbeddings(annotationsText.toArray), Map.empty) + val result: Array[Array[Float]] = annotationsText.map(model.embed).toArray + (result, Map.empty) } catch { case e: Exception => logger.error("Error in llama.cpp embeddings", e) @@ -241,7 +245,7 @@ trait ReadAutoGGUFEmbeddings { .setModelIfNotSet(spark, GGUFWrapper.read(spark, localPath)) .setEngine(LlamaCPP.name) - val metadata = LlamaModel.getMetadataFromFile(localPath) + val metadata = LlamaExtensions.getMetadataFromFile(localPath) if (metadata.nonEmpty) annotatorModel.setMetadata(metadata) annotatorModel } diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala index 01cb289903550d..96be045eef66a3 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala @@ -87,23 +87,23 @@ class AutoGGUFModelTest extends AnyFlatSpec { it should "accept all parameters that are settable" taggedAs SlowTest in { // Model Parameters model.setNThreads(8) - model.setNThreadsDraft(8) +// model.setNThreadsDraft(8) model.setNThreadsBatch(8) - model.setNThreadsBatchDraft(8) +// model.setNThreadsBatchDraft(8) model.setNCtx(512) model.setNBatch(32) model.setNUbatch(32) model.setNDraft(5) - model.setNChunks(-1) - model.setNSequences(1) - model.setPSplit(0.1f) +// model.setNChunks(-1) +// model.setNSequences(1) +// model.setPSplit(0.1f) model.setNGpuLayers(99) model.setNGpuLayersDraft(99) model.setGpuSplitMode("NONE") model.setMainGpu(0) - model.setTensorSplit(Array[Double]()) - model.setGrpAttnN(1) - model.setGrpAttnW(512) +// model.setTensorSplit(Array[Double]()) +// model.setGrpAttnN(1) +// model.setGrpAttnW(512) model.setRopeFreqBase(1.0f) model.setRopeFreqScale(1.0f) model.setYarnExtFactor(1.0f) @@ -113,14 +113,14 @@ class AutoGGUFModelTest extends AnyFlatSpec { model.setYarnOrigCtx(0) model.setDefragmentationThreshold(-1.0f) model.setNumaStrategy("DISTRIBUTE") - model.setRopeScalingType("UNSPECIFIED") - model.setPoolingType("UNSPECIFIED") + model.setRopeScalingType("NONE") + model.setPoolingType("NONE") model.setModelDraft("") - model.setLookupCacheStaticFilePath("/tmp/sparknlp-llama-cpp-cache") - model.setLookupCacheDynamicFilePath("/tmp/sparknlp-llama-cpp-cache") +// model.setLookupCacheStaticFilePath("/tmp/sparknlp-llama-cpp-cache") +// model.setLookupCacheDynamicFilePath("/tmp/sparknlp-llama-cpp-cache") model.setEmbedding(false) model.setFlashAttention(false) - model.setInputPrefixBos(false) +// model.setInputPrefixBos(false) model.setUseMmap(false) model.setUseMlock(false) model.setNoKvOffload(false) @@ -130,7 +130,7 @@ class AutoGGUFModelTest extends AnyFlatSpec { // Inference Parameters model.setInputPrefix("") model.setInputSuffix("") - model.setCachePrompt(false) + model.setCachePrompt(true) model.setNPredict(-1) model.setTopK(40) model.setTopP(0.9f) @@ -164,7 +164,7 @@ 
class AutoGGUFModelTest extends AnyFlatSpec { // Struct Features model.setTokenIdBias(Map(0 -> 0.0f, 1 -> 0.0f)) model.setTokenBias(Map("!" -> 0.0f, "?" -> 0.0f)) - model.setLoraAdapters(Map(" " -> 0.0f)) +// model.setLoraAdapters(Map(" " -> 0.0f)) lazy val pipeline = new Pipeline().setStages(Array(documentAssembler, model)) diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTestSpec.scala index f9a90635d6ac2d..0c05df61fa7c39 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTestSpec.scala @@ -41,7 +41,8 @@ class AutoGGUFEmbeddingsTestSpec extends AnyFlatSpec { .setBatchSize(4) .setPoolingType(poolingType) .setNCtx(8192) - def pipeline(embedModel: AutoGGUFEmbeddings = model("MEAN")) = + + def pipeline(embedModel: AutoGGUFEmbeddings = model("MEAN")): Pipeline = new Pipeline().setStages(Array(documentAssembler, embedModel)) it should "produce embeddings" taggedAs SlowTest in { @@ -110,8 +111,8 @@ class AutoGGUFEmbeddingsTestSpec extends AnyFlatSpec { it should "embed long text" taggedAs SlowTest in { val result = pipeline( model("MEAN") - .setNUbatch(2048) - .setNBatch(2048)).fit(longData).transform(longData) + .setNUbatch(4096) + .setNBatch(4096)).fit(longData).transform(longData) val collected = Annotation.collect(result, "embeddings") assert(collected.length == longDataCopies, "Should return the same number of rows")
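
The hunks above migrate the GGUF annotators from the bundled com.johnsnowlabs.nlp.llama bindings to the upstream de.kherud.llama 1.0.1 artifacts, rename several ModelParameters setters (setModelFilePath -> setModel, setNCtx -> setCtxSize, setNParallel -> setParallel, and so on), comment out the retired draft/lookup-cache/LoRA parameters, and change the AutoGGUFModel defaults (nGpuLayers -> 99, plus a default systemPrompt). As a minimal, hedged sketch of how the annotator is driven after this change — based only on the parameter names visible in the updated tests, with the model resolved by pretrained() left at its default and `data` standing in for any DataFrame with a "text" column — Scala usage would look roughly like:

    import com.johnsnowlabs.nlp.DocumentAssembler
    import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel
    import org.apache.spark.ml.Pipeline

    // Assemble raw text into DOCUMENT annotations.
    val documentAssembler = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")

    // AutoGGUFModel now defaults to nGpuLayers = 99 and a generic system prompt,
    // so the removed setGpuSupportIfAvailable hook is no longer needed for GPU offloading.
    val autoGGUF = AutoGGUFModel
      .pretrained()
      .setInputCols("document")
      .setOutputCol("completions")
      .setBatchSize(4)      // batch size doubles as the number of parallel decoding slots (setParallel)
      .setNPredict(20)
      .setTemperature(0.4f)

    val pipeline = new Pipeline().setStages(Array(documentAssembler, autoGGUF))
    val results = pipeline.fit(data).transform(data)

The embeddings path changes in the same spirit: as the AutoGGUFEmbeddings hunk shows, each document is now passed through model.embed rather than the removed requestBatchEmbeddings call.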