diff --git a/build.sbt b/build.sbt index 2e2853e89eadd1..5d6bc3cb7de7a0 100644 --- a/build.sbt +++ b/build.sbt @@ -6,7 +6,7 @@ name := getPackageName(is_silicon, is_gpu, is_aarch64) organization := "com.johnsnowlabs.nlp" -version := "6.0.5" +version := "6.1.0-rc1" (ThisBuild / scalaVersion) := scalaVer diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 22b306b0354c9c..cfdcba4ca0ada7 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -128,11 +128,11 @@ object Dependencies { val azureIdentity = "com.azure" % "azure-identity" % azureIdentityVersion % Provided val azureStorage = "com.azure" % "azure-storage-blob" % azureStorageVersion % Provided - val llamaCppVersion = "0.1.6" - val llamaCppCPU = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-cpu" % llamaCppVersion - val llamaCppGPU = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-gpu" % llamaCppVersion - val llamaCppSilicon = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-silicon" % llamaCppVersion - val llamaCppAarch64 = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-aarch64" % llamaCppVersion + val llamaCppVersion = "1.0.1" + val llamaCppCPU = "com.johnsnowlabs.nlp" % "jsl-llamacpp-cpu" % llamaCppVersion + val llamaCppGPU = "com.johnsnowlabs.nlp" % "jsl-llamacpp-gpu" % llamaCppVersion + val llamaCppSilicon = "com.johnsnowlabs.nlp" % "jsl-llamacpp-silicon" % llamaCppVersion + val llamaCppAarch64 = "com.johnsnowlabs.nlp" % "jsl-llamacpp-aarch64" % llamaCppVersion val jsoupVersion = "1.18.2" diff --git a/python/sparknlp/annotator/seq2seq/auto_gguf_model.py b/python/sparknlp/annotator/seq2seq/auto_gguf_model.py index 37c96319564782..2d01c29fea57fc 100755 --- a/python/sparknlp/annotator/seq2seq/auto_gguf_model.py +++ b/python/sparknlp/annotator/seq2seq/auto_gguf_model.py @@ -253,7 +253,9 @@ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFMo nCtx=4096, nBatch=512, embedding=False, - nPredict=100 + nPredict=100, + nGpuLayers=99, + systemPrompt="You are a helpful assistant." 
) @staticmethod diff --git a/python/sparknlp/common/properties.py b/python/sparknlp/common/properties.py index a134a86ac562eb..4cfbf70c5f0dd8 100644 --- a/python/sparknlp/common/properties.py +++ b/python/sparknlp/common/properties.py @@ -765,14 +765,14 @@ class HasLlamaCppProperties: # -------- MODEl PARAMETERS -------- nThreads = Param(Params._dummy(), "nThreads", "Set the number of threads to use during generation", typeConverter=TypeConverters.toInt) - nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation", - typeConverter=TypeConverters.toInt) + # nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation", + # typeConverter=TypeConverters.toInt) nThreadsBatch = Param(Params._dummy(), "nThreadsBatch", "Set the number of threads to use during batch and prompt processing", typeConverter=TypeConverters.toInt) - nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft", - "Set the number of threads to use during batch and prompt processing", - typeConverter=TypeConverters.toInt) + # nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft", + # "Set the number of threads to use during batch and prompt processing", + # typeConverter=TypeConverters.toInt) nCtx = Param(Params._dummy(), "nCtx", "Set the size of the prompt context", typeConverter=TypeConverters.toInt) nBatch = Param(Params._dummy(), "nBatch", "Set the logical batch size for prompt processing (must be >=32 to use BLAS)", @@ -782,12 +782,12 @@ class HasLlamaCppProperties: typeConverter=TypeConverters.toInt) nDraft = Param(Params._dummy(), "nDraft", "Set the number of tokens to draft for speculative decoding", typeConverter=TypeConverters.toInt) - nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process", - typeConverter=TypeConverters.toInt) - nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode", - typeConverter=TypeConverters.toInt) - pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability", - typeConverter=TypeConverters.toFloat) + # nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process", + # typeConverter=TypeConverters.toInt) + # nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode", + # typeConverter=TypeConverters.toInt) + # pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability", + # typeConverter=TypeConverters.toFloat) nGpuLayers = Param(Params._dummy(), "nGpuLayers", "Set the number of layers to store in VRAM (-1 - use default)", typeConverter=TypeConverters.toInt) nGpuLayersDraft = Param(Params._dummy(), "nGpuLayersDraft", @@ -802,10 +802,10 @@ class HasLlamaCppProperties: typeConverter=TypeConverters.toString) mainGpu = Param(Params._dummy(), "mainGpu", "Set the main GPU that is used for scratch and small tensors.", typeConverter=TypeConverters.toInt) - tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs", - typeConverter=TypeConverters.toListFloat) - grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt) - grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt) + # tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs", + # 
typeConverter=TypeConverters.toListFloat) + # grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt) + # grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt) ropeFreqBase = Param(Params._dummy(), "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling", typeConverter=TypeConverters.toFloat) ropeFreqScale = Param(Params._dummy(), "ropeFreqScale", @@ -837,7 +837,7 @@ class HasLlamaCppProperties: typeConverter=TypeConverters.toString) # Set the RoPE frequency scaling method, defaults to linear unless specified by the model. # - # - UNSPECIFIED: Don't use any scaling + # - NONE: Don't use any scaling # - LINEAR: Linear scaling # - YARN: YaRN RoPE scaling ropeScalingType = Param(Params._dummy(), "ropeScalingType", @@ -848,26 +848,28 @@ class HasLlamaCppProperties: # - 0 NONE: Don't use any pooling # - 1 MEAN: Mean Pooling # - 2 CLS: CLS Pooling + # - 3 LAST: Last token pooling + # - 4 RANK: For reranked models poolingType = Param(Params._dummy(), "poolingType", "Set the pooling type for embeddings, use model default if unspecified", typeConverter=TypeConverters.toString) modelDraft = Param(Params._dummy(), "modelDraft", "Set the draft model for speculative decoding", typeConverter=TypeConverters.toString) modelAlias = Param(Params._dummy(), "modelAlias", "Set a model alias", typeConverter=TypeConverters.toString) - lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath", - "Set path to static lookup cache to use for lookup decoding (not updated by generation)", - typeConverter=TypeConverters.toString) - lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath", - "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)", - typeConverter=TypeConverters.toString) + # lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath", + # "Set path to static lookup cache to use for lookup decoding (not updated by generation)", + # typeConverter=TypeConverters.toString) + # lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath", + # "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)", + # typeConverter=TypeConverters.toString) # loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters") embedding = Param(Params._dummy(), "embedding", "Whether to load model with embedding support", typeConverter=TypeConverters.toBoolean) flashAttention = Param(Params._dummy(), "flashAttention", "Whether to enable Flash Attention", typeConverter=TypeConverters.toBoolean) - inputPrefixBos = Param(Params._dummy(), "inputPrefixBos", - "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string", - typeConverter=TypeConverters.toBoolean) + # inputPrefixBos = Param(Params._dummy(), "inputPrefixBos", + # "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string", + # typeConverter=TypeConverters.toBoolean) useMmap = Param(Params._dummy(), "useMmap", "Whether to use memory-map model (faster load but may increase pageouts if not using mlock)", typeConverter=TypeConverters.toBoolean) @@ -948,17 +950,17 @@ def setNThreads(self, nThreads: int): """Set the number of threads to use during generation""" return self._set(nThreads=nThreads) - def setNThreadsDraft(self, nThreadsDraft: int): - """Set the number of threads to use during draft generation""" - return self._set(nThreadsDraft=nThreadsDraft) 
+ # def setNThreadsDraft(self, nThreadsDraft: int): + # """Set the number of threads to use during draft generation""" + # return self._set(nThreadsDraft=nThreadsDraft) def setNThreadsBatch(self, nThreadsBatch: int): """Set the number of threads to use during batch and prompt processing""" return self._set(nThreadsBatch=nThreadsBatch) - def setNThreadsBatchDraft(self, nThreadsBatchDraft: int): - """Set the number of threads to use during batch and prompt processing""" - return self._set(nThreadsBatchDraft=nThreadsBatchDraft) + # def setNThreadsBatchDraft(self, nThreadsBatchDraft: int): + # """Set the number of threads to use during batch and prompt processing""" + # return self._set(nThreadsBatchDraft=nThreadsBatchDraft) def setNCtx(self, nCtx: int): """Set the size of the prompt context""" @@ -976,17 +978,17 @@ def setNDraft(self, nDraft: int): """Set the number of tokens to draft for speculative decoding""" return self._set(nDraft=nDraft) - def setNChunks(self, nChunks: int): - """Set the maximal number of chunks to process""" - return self._set(nChunks=nChunks) + # def setNChunks(self, nChunks: int): + # """Set the maximal number of chunks to process""" + # return self._set(nChunks=nChunks) - def setNSequences(self, nSequences: int): - """Set the number of sequences to decode""" - return self._set(nSequences=nSequences) + # def setNSequences(self, nSequences: int): + # """Set the number of sequences to decode""" + # return self._set(nSequences=nSequences) - def setPSplit(self, pSplit: float): - """Set the speculative decoding split probability""" - return self._set(pSplit=pSplit) + # def setPSplit(self, pSplit: float): + # """Set the speculative decoding split probability""" + # return self._set(pSplit=pSplit) def setNGpuLayers(self, nGpuLayers: int): """Set the number of layers to store in VRAM (-1 - use default)""" @@ -1004,17 +1006,17 @@ def setMainGpu(self, mainGpu: int): """Set the main GPU that is used for scratch and small tensors.""" return self._set(mainGpu=mainGpu) - def setTensorSplit(self, tensorSplit: List[float]): - """Set how split tensors should be distributed across GPUs""" - return self._set(tensorSplit=tensorSplit) + # def setTensorSplit(self, tensorSplit: List[float]): + # """Set how split tensors should be distributed across GPUs""" + # return self._set(tensorSplit=tensorSplit) - def setGrpAttnN(self, grpAttnN: int): - """Set the group-attention factor""" - return self._set(grpAttnN=grpAttnN) + # def setGrpAttnN(self, grpAttnN: int): + # """Set the group-attention factor""" + # return self._set(grpAttnN=grpAttnN) - def setGrpAttnW(self, grpAttnW: int): - """Set the group-attention width""" - return self._set(grpAttnW=grpAttnW) + # def setGrpAttnW(self, grpAttnW: int): + # """Set the group-attention width""" + # return self._set(grpAttnW=grpAttnW) def setRopeFreqBase(self, ropeFreqBase: float): """Set the RoPE base frequency, used by NTK-aware scaling""" @@ -1049,7 +1051,16 @@ def setDefragmentationThreshold(self, defragmentationThreshold: float): return self._set(defragmentationThreshold=defragmentationThreshold) def setNumaStrategy(self, numaStrategy: str): - """Set optimization strategies that help on some NUMA systems (if available)""" + """Set optimization strategies that help on some NUMA systems (if available) + + Possible values: + + - DISABLED: No NUMA optimizations + - DISTRIBUTE: spread execution evenly over all + - ISOLATE: only spawn threads on CPUs on the node that execution started on + - NUMA_CTL: use the CPU map provided by numactl + - MIRROR: 
Mirrors the model across NUMA nodes + """ numaUpper = numaStrategy.upper() numaStrategies = ["DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR"] if numaUpper not in numaStrategies: @@ -1060,13 +1071,36 @@ def setNumaStrategy(self, numaStrategy: str): return self._set(numaStrategy=numaStrategy) def setRopeScalingType(self, ropeScalingType: str): - """Set the RoPE frequency scaling method, defaults to linear unless specified by the model""" - return self._set(ropeScalingType=ropeScalingType) + """Set the RoPE frequency scaling method, defaults to linear unless specified by the model. + + Possible values: + + - NONE: Don't use any scaling + - LINEAR: Linear scaling + - YARN: YaRN RoPE scaling + """ + ropeScalingTypeUpper = ropeScalingType.upper() + ropeScalingTypes = ["NONE", "LINEAR", "YARN"] + if ropeScalingTypeUpper not in ropeScalingTypes: + raise ValueError( + f"Invalid RoPE scaling type: {ropeScalingType}. " + + f"Valid values are: {ropeScalingTypes}" + ) + return self._set(ropeScalingType=ropeScalingTypeUpper) def setPoolingType(self, poolingType: str): - """Set the pooling type for embeddings, use model default if unspecified""" + """Set the pooling type for embeddings, use model default if unspecified + + Possible values: + + - 0 NONE: Don't use any pooling + - 1 MEAN: Mean Pooling + - 2 CLS: CLS Pooling + - 3 LAST: Last token pooling + - 4 RANK: For reranked models + """ poolingTypeUpper = poolingType.upper() - poolingTypes = ["NONE", "MEAN", "CLS", "LAST"] + poolingTypes = ["NONE", "MEAN", "CLS", "LAST", "RANK"] if poolingTypeUpper not in poolingTypes: raise ValueError( f"Invalid pooling type: {poolingType}. " @@ -1082,13 +1116,13 @@ def setModelAlias(self, modelAlias: str): """Set a model alias""" return self._set(modelAlias=modelAlias) - def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str): - """Set path to static lookup cache to use for lookup decoding (not updated by generation)""" - return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath) + # def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str): + # """Set path to static lookup cache to use for lookup decoding (not updated by generation)""" + # return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath) - def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str): - """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)""" - return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath) + # def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str): + # """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)""" + # return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath) def setEmbedding(self, embedding: bool): """Whether to load model with embedding support""" @@ -1098,9 +1132,9 @@ def setFlashAttention(self, flashAttention: bool): """Whether to enable Flash Attention""" return self._set(flashAttention=flashAttention) - def setInputPrefixBos(self, inputPrefixBos: bool): - """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string""" - return self._set(inputPrefixBos=inputPrefixBos) + # def setInputPrefixBos(self, inputPrefixBos: bool): + # """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string""" + # return self._set(inputPrefixBos=inputPrefixBos) def setUseMmap(self, useMmap: bool): """Whether to use memory-map model (faster load but may increase pageouts if not using mlock)""" @@ -1260,9 +1294,9 @@ 
def setTokenBias(self, tokenBias: Dict[str, float]): """Set token id bias""" return self._call_java("setTokenBias", tokenBias) - def setLoraAdapters(self, loraAdapters: Dict[str, float]): - """Set LoRA adapters with their scaling factors""" - return self._call_java("setLoraAdapters", loraAdapters) + # def setLoraAdapters(self, loraAdapters: Dict[str, float]): + # """Set LoRA adapters with their scaling factors""" + # return self._call_java("setLoraAdapters", loraAdapters) def getMetadata(self): """Gets the metadata of the model""" diff --git a/python/test/annotator/embeddings/auto_gguf_embeddings_test.py b/python/test/annotator/embeddings/auto_gguf_embeddings_test.py index 0f9ebe0b4ea247..a8c7fa01838f37 100644 --- a/python/test/annotator/embeddings/auto_gguf_embeddings_test.py +++ b/python/test/annotator/embeddings/auto_gguf_embeddings_test.py @@ -153,8 +153,8 @@ def runTest(self): .setInputCols("document") .setOutputCol("embeddings") .setBatchSize(4) - .setNUbatch(2048) - .setNBatch(2048) + .setNUbatch(4096) + .setNBatch(4096) ) pipeline = Pipeline().setStages([self.document_assembler, model]) results = pipeline.fit(self.long_data).transform(self.long_data) diff --git a/python/test/annotator/seq2seq/auto_gguf_model_test.py b/python/test/annotator/seq2seq/auto_gguf_model_test.py index cb014591ae33bc..e34c3a956ff2bb 100644 --- a/python/test/annotator/seq2seq/auto_gguf_model_test.py +++ b/python/test/annotator/seq2seq/auto_gguf_model_test.py @@ -49,7 +49,7 @@ def runTest(self): .setOutputCol("completions") .setBatchSize(4) .setNPredict(20) - .setNGpuLayers(5) + .setNGpuLayers(99) .setTemperature(0.4) .setTopK(40) .setTopP(0.9) @@ -78,7 +78,7 @@ def runTest(self): DocumentAssembler().setInputCol("text").setOutputCol("document") ) - model = ( + model: AutoGGUFModel = ( AutoGGUFModel.pretrained() .setInputCols("document") .setOutputCol("completions") @@ -87,23 +87,23 @@ def runTest(self): # Model Parameters model.setNThreads(8) - model.setNThreadsDraft(8) + # model.setNThreadsDraft(8) model.setNThreadsBatch(8) - model.setNThreadsBatchDraft(8) + # model.setNThreadsBatchDraft(8) model.setNCtx(512) model.setNBatch(32) model.setNUbatch(32) model.setNDraft(5) - model.setNChunks(-1) - model.setNSequences(1) - model.setPSplit(0.1) + # model.setNChunks(-1) + # model.setNSequences(1) + # model.setPSplit(0.1) model.setNGpuLayers(99) model.setNGpuLayersDraft(99) model.setGpuSplitMode("NONE") model.setMainGpu(0) - model.setTensorSplit([]) - model.setGrpAttnN(1) - model.setGrpAttnW(512) + # model.setTensorSplit([]) + # model.setGrpAttnN(1) + # model.setGrpAttnW(512) model.setRopeFreqBase(1.0) model.setRopeFreqScale(1.0) model.setYarnExtFactor(1.0) @@ -113,14 +113,14 @@ def runTest(self): model.setYarnOrigCtx(0) model.setDefragmentationThreshold(-1.0) model.setNumaStrategy("DISTRIBUTE") - model.setRopeScalingType("UNSPECIFIED") + model.setRopeScalingType("NONE") model.setPoolingType("NONE") model.setModelDraft("") - model.setLookupCacheStaticFilePath("/tmp/sparknlp-llama-cpp-cache") - model.setLookupCacheDynamicFilePath("/tmp/sparknlp-llama-cpp-cache") + # model.setLookupCacheStaticFilePath("/tmp/sparknlp-llama-cpp-cache") + # model.setLookupCacheDynamicFilePath("/tmp/sparknlp-llama-cpp-cache") model.setEmbedding(False) model.setFlashAttention(False) - model.setInputPrefixBos(False) + # model.setInputPrefixBos(False) model.setUseMmap(False) model.setUseMlock(False) model.setNoKvOffload(False) @@ -164,7 +164,7 @@ def runTest(self): # Special PySpark Parameters (Scala StructFeatures) model.setTokenIdBias({0: 
0.0, 1: 0.0}) model.setTokenBias({"!": 0.0, "?": 0.0}) - model.setLoraAdapters({" ": 0.0}) + # model.setLoraAdapters({" ": 0.0}) pipeline = Pipeline().setStages([document_assembler, model]) results = pipeline.fit(data).transform(data) diff --git a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala index 6f68ead3a51ef0..1d65a8daa567d6 100644 --- a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala +++ b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala @@ -15,8 +15,8 @@ */ package com.johnsnowlabs.ml.gguf -import com.johnsnowlabs.nlp.llama.{LlamaModel, ModelParameters} import com.johnsnowlabs.nlp.util.io.ResourceHelper +import de.kherud.llama.{LlamaModel, ModelParameters} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkFiles import org.apache.spark.sql.SparkSession @@ -42,7 +42,7 @@ class GGUFWrapper(var modelFileName: String, var modelFolder: String) extends Se val modelFilePath = SparkFiles.get(modelFileName) if (Paths.get(modelFilePath).toFile.exists()) { - modelParameters.setModelFilePath(modelFilePath) + modelParameters.setModel(modelFilePath) llamaModel = GGUFWrapper.withSafeGGUFModelLoader(modelParameters) } else throw new IllegalStateException( diff --git a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapperMultiModal.scala b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapperMultiModal.scala index 89eb8f517360f2..4f8fef32dd0904 100644 --- a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapperMultiModal.scala +++ b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapperMultiModal.scala @@ -15,7 +15,7 @@ */ package com.johnsnowlabs.ml.gguf -import com.johnsnowlabs.nlp.llama.{LlamaModel, ModelParameters} +import de.kherud.llama.{LlamaModel, ModelParameters} import com.johnsnowlabs.nlp.util.io.ResourceHelper import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkFiles @@ -44,8 +44,8 @@ class GGUFWrapperMultiModal(var modelFileName: String, var mmprojFileName: Strin Paths.get(modelFilePath).toFile.exists() && Paths.get(mmprojFilePath).toFile.exists() if (filesExist) { - modelParameters.setModelFilePath(modelFilePath) - modelParameters.setMMProj(mmprojFilePath) + modelParameters.setModel(modelFilePath) +// modelParameters.setMMProj(mmprojFilePath) // TODO: Vision models implementation llamaModel = GGUFWrapperMultiModal.withSafeGGUFModelLoader(modelParameters) } else throw new IllegalStateException( diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppInferenceProperties.scala b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppInferenceProperties.scala index e200610b38a2a9..fcc797ddbaf417 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppInferenceProperties.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppInferenceProperties.scala @@ -1,8 +1,8 @@ package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel -import com.johnsnowlabs.nlp.llama.InferenceParameters -import com.johnsnowlabs.nlp.llama.args._ +import de.kherud.llama.InferenceParameters +import de.kherud.llama.args._ import com.johnsnowlabs.nlp.serialization.StructFeature import org.apache.spark.ml.param._ diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppModelProperties.scala b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppModelProperties.scala index e71a7b999f25c2..2c4ddee89320c1 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppModelProperties.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppModelProperties.scala @@ 
-1,18 +1,13 @@ package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel -import com.johnsnowlabs.nlp.llama.ModelParameters -import com.johnsnowlabs.nlp.llama.args.{GpuSplitMode, NumaStrategy, PoolingType, RopeScalingType} -import com.johnsnowlabs.nlp.serialization.StructFeature +import de.kherud.llama.ModelParameters +import de.kherud.llama.args.{GpuSplitMode, NumaStrategy, PoolingType, RopeScalingType} import org.apache.spark.ml.param._ -import org.apache.spark.sql.SparkSession import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods import org.slf4j.LoggerFactory -import scala.collection.mutable -import scala.jdk.CollectionConverters._ - /** Contains settable model parameters for the [[AutoGGUFModel]]. * * @groupname param Parameters @@ -34,10 +29,10 @@ trait HasLlamaCppModelProperties { new IntParam(this, "nThreads", "Set the number of threads to use during generation") /** @group param */ - val nThreadsDraft = new IntParam( - this, - "nThreadsDraft", - "Set the number of threads to use during draft generation") +// val nThreadsDraft = new IntParam( +// this, +// "nThreadsDraft", +// "Set the number of threads to use during draft generation") /** @group param */ val nThreadsBatch = new IntParam( @@ -46,10 +41,10 @@ trait HasLlamaCppModelProperties { "Set the number of threads to use during batch and prompt processing") /** @group param */ - val nThreadsBatchDraft = new IntParam( - this, - "nThreadsBatchDraft", - "Set the number of threads to use during batch and prompt processing") +// val nThreadsBatchDraft = new IntParam( +// this, +// "nThreadsBatchDraft", +// "Set the number of threads to use during batch and prompt processing") /** @group param */ val nCtx = new IntParam(this, "nCtx", "Set the size of the prompt context") @@ -71,14 +66,14 @@ trait HasLlamaCppModelProperties { new IntParam(this, "nDraft", "Set the number of tokens to draft for speculative decoding") /** @group param */ - val nChunks = new IntParam(this, "nChunks", "Set the maximal number of chunks to process") +// val nChunks = new IntParam(this, "nChunks", "Set the maximal number of chunks to process") /** @group param */ - val nSequences = - new IntParam(this, "nSequences", "Set the number of sequences to decode") +// val nSequences = +// new IntParam(this, "nSequences", "Set the number of sequences to decode") /** @group param */ - val pSplit = new FloatParam(this, "pSplit", "Set the speculative decoding split probability") +// val pSplit = new FloatParam(this, "pSplit", "Set the speculative decoding split probability") /** @group param */ val nGpuLayers = new IntParam( @@ -108,16 +103,16 @@ trait HasLlamaCppModelProperties { new IntParam(this, "mainGpu", "Set the main GPU that is used for scratch and small tensors.") /** @group param */ - val tensorSplit = new DoubleArrayParam( - this, - "tensorSplit", - "Set how split tensors should be distributed across GPUs") +// val tensorSplit = new DoubleArrayParam( +// this, +// "tensorSplit", +// "Set how split tensors should be distributed across GPUs") // TODO /** @group param */ - val grpAttnN = new IntParam(this, "grpAttnN", "Set the group-attention factor") +// val grpAttnN = new IntParam(this, "grpAttnN", "Set the group-attention factor") /** @group param */ - val grpAttnW = new IntParam(this, "grpAttnW", "Set the group-attention width") +// val grpAttnW = new IntParam(this, "grpAttnW", "Set the group-attention width") /** @group param */ val ropeFreqBase = @@ -202,19 +197,19 @@ trait HasLlamaCppModelProperties { new 
Param[String](this, "modelDraft", "Set the draft model for speculative decoding") /** @group param */ - val lookupCacheStaticFilePath = new Param[String]( - this, - "lookupCacheStaticFilePath", - "Set path to static lookup cache to use for lookup decoding (not updated by generation)") +// val lookupCacheStaticFilePath = new Param[String]( +// this, +// "lookupCacheStaticFilePath", +// "Set path to static lookup cache to use for lookup decoding (not updated by generation)") - /** @group param */ - val lookupCacheDynamicFilePath = new Param[String]( - this, - "lookupCacheDynamicFilePath", - "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)") +// /** @group param */ +// val lookupCacheDynamicFilePath = new Param[String]( +// this, +// "lookupCacheDynamicFilePath", +// "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)") /** @group param */ - val loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters") +// val loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters") /** @group param */ val embedding = @@ -224,11 +219,11 @@ trait HasLlamaCppModelProperties { val flashAttention = new BooleanParam(this, "flashAttention", "Whether to enable Flash Attention") - /** @group param */ - val inputPrefixBos = new BooleanParam( - this, - "inputPrefixBos", - "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string") +// /** @group param */ +// val inputPrefixBos = new BooleanParam( +// this, +// "inputPrefixBos", +// "This parameter is deprecated and will have not effect.") /** @group param */ val useMmap = new BooleanParam( @@ -272,9 +267,9 @@ trait HasLlamaCppModelProperties { * * @group setParam */ - def setNThreadsDraft(nThreadsDraft: Int): this.type = { - checkEmbeddingMode { set(this.nThreadsDraft, nThreadsDraft) } - } +// def setNThreadsDraft(nThreadsDraft: Int): this.type = { +// checkEmbeddingMode { set(this.nThreadsDraft, nThreadsDraft) } +// } /** Set the number of threads to use during batch and prompt processing * @@ -288,9 +283,9 @@ trait HasLlamaCppModelProperties { * * @group setParam */ - def setNThreadsBatchDraft(nThreadsBatchDraft: Int): this.type = { - checkEmbeddingMode { set(this.nThreadsBatchDraft, nThreadsBatchDraft) } - } +// def setNThreadsBatchDraft(nThreadsBatchDraft: Int): this.type = { +// checkEmbeddingMode { set(this.nThreadsBatchDraft, nThreadsBatchDraft) } +// } /** Set the size of the prompt context * @@ -328,25 +323,25 @@ trait HasLlamaCppModelProperties { * * @group setParam */ - def setNChunks(nChunks: Int): this.type = { - set(this.nChunks, nChunks) - } +// def setNChunks(nChunks: Int): this.type = { +// set(this.nChunks, nChunks) +// } /** Set the number of sequences to decode * * @group setParam */ - def setNSequences(nSequences: Int): this.type = { - set(this.nSequences, nSequences) - } +// def setNSequences(nSequences: Int): this.type = { +// set(this.nSequences, nSequences) +// } /** Set the speculative decoding split probability * * @group setParam */ - def setPSplit(pSplit: Float): this.type = { - checkEmbeddingMode { set(this.pSplit, pSplit) } - } +// def setPSplit(pSplit: Float): this.type = { +// checkEmbeddingMode { set(this.pSplit, pSplit) } +// } /** Set the number of layers to store in VRAM (-1 - use default) * @@ -387,25 +382,25 @@ trait HasLlamaCppModelProperties { * * @group setParam */ - def setTensorSplit(tensorSplit: Array[Double]): this.type = { - set(this.tensorSplit, tensorSplit) - } +// def setTensorSplit(tensorSplit: 
Array[Double]): this.type = { +// set(this.tensorSplit, tensorSplit) +// } /** Set the group-attention factor * * @group setParam */ - def setGrpAttnN(grpAttnN: Int): this.type = { - set(this.grpAttnN, grpAttnN) - } +// def setGrpAttnN(grpAttnN: Int): this.type = { +// set(this.grpAttnN, grpAttnN) +// } /** Set the group-attention width * * @group setParam */ - def setGrpAttnW(grpAttnW: Int): this.type = { - set(this.grpAttnW, grpAttnW) - } +// def setGrpAttnW(grpAttnW: Int): this.type = { +// set(this.grpAttnW, grpAttnW) +// } /** Set the RoPE base frequency, used by NTK-aware scaling * @@ -488,38 +483,47 @@ trait HasLlamaCppModelProperties { val numaStrategies = Array("DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR") require( numaStrategies.contains(numaUpper), - s"Invalid NUMA strategy: $numa. " + + s"Invalid NUMA strategy: $numaUpper. " + s"Valid values are: ${numaStrategies.mkString(", ")}") set(this.numaStrategy, numaUpper) } /** Set the RoPE frequency scaling method, defaults to linear unless specified by the model. * - * - UNSPECIFIED: Don't use any scaling + * - NONE: Don't use any scaling * - LINEAR: Linear scaling * - YARN: YaRN RoPE scaling * * @group setParam */ def setRopeScalingType(ropeScalingType: String): this.type = { - set(this.ropeScalingType, ropeScalingType) + val ropeUpper = ropeScalingType.toUpperCase + val ropeScalingTypes = Array("NONE", "LINEAR", "YARN") + require( + ropeScalingTypes.contains(ropeUpper), + s"Invalid RoPE scaling type: $ropeUpper. " + + s"Valid values are: ${ropeScalingTypes.mkString(", ")}") + set(this.ropeScalingType, ropeUpper) } - /** Set the pooling type for embeddings, use model default if unspecified + /** Set the pooling type for embeddings, use model default if unspecified. * - * - 0 NONE: Don't use any pooling and return token embeddings (if the model supports it) - * - 1 MEAN: Mean Pooling - * - 2 CLS: Choose the CLS token - * - 3 LAST: Choose the last token + * Possible values: + * + * - NONE: No pooling + * - MEAN: Mean pooling + * - CLS: Choose the CLS token + * - LAST: Choose the last token + * - RANK: For reranking * * @group setParam */ def setPoolingType(poolingType: String): this.type = { val poolingTypeUpper = poolingType.toUpperCase - val poolingTypes = Array("NONE", "MEAN", "CLS", "LAST") + val poolingTypes = Array("NONE", "MEAN", "CLS", "LAST", "RANK") require( poolingTypes.contains(poolingTypeUpper), - s"Invalid pooling type: $poolingType. " + + s"Invalid pooling type: $poolingTypeUpper. 
" + s"Valid values are: ${poolingTypes.mkString(", ")}") set(this.poolingType, poolingTypeUpper) } @@ -536,34 +540,34 @@ trait HasLlamaCppModelProperties { * * @group setParam */ - def setLookupCacheStaticFilePath(lookupCacheStaticFilePath: String): this.type = { - checkEmbeddingMode { set(this.lookupCacheStaticFilePath, lookupCacheStaticFilePath) } - } - - /** Set path to dynamic lookup cache to use for lookup decoding (updated by generation) - * - * @group setParam - */ - def setLookupCacheDynamicFilePath(lookupCacheDynamicFilePath: String): this.type = { - checkEmbeddingMode { set(this.lookupCacheDynamicFilePath, lookupCacheDynamicFilePath) } - } +// def setLookupCacheStaticFilePath(lookupCacheStaticFilePath: String): this.type = { +// checkEmbeddingMode { set(this.lookupCacheStaticFilePath, lookupCacheStaticFilePath) } +// } +// /** Set path to dynamic lookup cache to use for lookup decoding (updated by generation) +// * +// * @group setParam +// */ +// def setLookupCacheDynamicFilePath(lookupCacheDynamicFilePath: String): this.type = { +// checkEmbeddingMode { set(this.lookupCacheDynamicFilePath, lookupCacheDynamicFilePath) } +// } +// /** Sets paths to lora adapters with user defined scale. * * @group setParam */ - def setLoraAdapters(loraAdapters: Map[String, Float]): this.type = { - set(this.loraAdapters, loraAdapters) - } +// def setLoraAdapters(loraAdapters: Map[String, Float]): this.type = { +// set(this.loraAdapters, loraAdapters) +// } /** Sets paths to lora adapters with user defined scale. (PySpark Override) * * @group setParam */ - def setLoraAdapters(loraAdapters: java.util.HashMap[String, java.lang.Double]): this.type = { - val scalaLoraAdapters = loraAdapters.asScala.map { case (k, v) => k -> v.floatValue() } - set(this.loraAdapters, scalaLoraAdapters.toMap) - } +// def setLoraAdapters(loraAdapters: java.util.HashMap[String, java.lang.Double]): this.type = { +// val scalaLoraAdapters = loraAdapters.asScala.map { case (k, v) => k -> v.floatValue() } +// set(this.loraAdapters, scalaLoraAdapters.toMap) +// } /** Whether to load model with embedding support * @@ -581,13 +585,13 @@ trait HasLlamaCppModelProperties { set(this.flashAttention, flashAttention) } - /** Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string - * - * @group setParam - */ - def setInputPrefixBos(inputPrefixBos: Boolean): this.type = { - set(this.inputPrefixBos, inputPrefixBos) - } +// /** Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string +// * +// * @group setParam +// */ +// def setInputPrefixBos(inputPrefixBos: Boolean): this.type = { +// set(this.inputPrefixBos, inputPrefixBos) +// } /** Whether to use memory-map model (faster load but may increase pageouts if not using mlock) * @@ -633,13 +637,13 @@ trait HasLlamaCppModelProperties { def getNThreads: Int = $(nThreads) /** @group getParam */ - def getNThreadsDraft: Int = $(nThreadsDraft) +// def getNThreadsDraft: Int = $(nThreadsDraft) /** @group getParam */ def getNThreadsBatch: Int = $(nThreadsBatch) /** @group getParam */ - def getNThreadsBatchDraft: Int = $(nThreadsBatchDraft) +// def getNThreadsBatchDraft: Int = $(nThreadsBatchDraft) /** @group getParam */ def getNCtx: Int = $(nCtx) @@ -654,13 +658,13 @@ trait HasLlamaCppModelProperties { def getNDraft: Int = $(nDraft) /** @group getParam */ - def getNChunks: Int = $(nChunks) +// def getNChunks: Int = $(nChunks) /** @group getParam */ - def getNSequences: Int = $(nSequences) +// def getNSequences: Int = $(nSequences) /** @group getParam */ - def 
getPSplit: Float = $(pSplit) +// def getPSplit: Float = $(pSplit) /** @group getParam */ def getNGpuLayers: Int = $(nGpuLayers) @@ -675,12 +679,12 @@ trait HasLlamaCppModelProperties { def getMainGpu: Int = $(mainGpu) /** @group getParam */ - def getTensorSplit: Array[Double] = $(tensorSplit) +// def getTensorSplit: Array[Double] = $(tensorSplit) - def getGrpAttnN: Int = $(grpAttnN) +// def getGrpAttnN: Int = $(grpAttnN) /** @group getParam */ - def getGrpAttnW: Int = $(grpAttnW) +// def getGrpAttnW: Int = $(grpAttnW) /** @group getParam */ def getRopeFreqBase: Float = $(ropeFreqBase) @@ -719,13 +723,13 @@ trait HasLlamaCppModelProperties { def getModelDraft: String = $(modelDraft) /** @group getParam */ - def getLookupCacheStaticFilePath: String = $(lookupCacheStaticFilePath) +// def getLookupCacheStaticFilePath: String = $(lookupCacheStaticFilePath) /** @group getParam */ - def getLookupCacheDynamicFilePath: String = $(lookupCacheDynamicFilePath) +// def getLookupCacheDynamicFilePath: String = $(lookupCacheDynamicFilePath) /** @group getParam */ - def getLoraAdapters: Map[String, Float] = $$(loraAdapters) +// def getLoraAdapters: Map[String, Float] = $$(loraAdapters) /** @group getParam */ def getEmbedding: Boolean = $(embedding) @@ -733,8 +737,8 @@ trait HasLlamaCppModelProperties { /** @group getParam */ def getFlashAttention: Boolean = $(flashAttention) - /** @group getParam */ - def getInputPrefixBos: Boolean = $(inputPrefixBos) +// /** @group getParam */ +// def getInputPrefixBos: Boolean = $(inputPrefixBos) /** @group getParam */ def getUseMmap: Boolean = $(useMmap) @@ -765,89 +769,90 @@ trait HasLlamaCppModelProperties { */ def getMetadata: String = $(metadata) - def getMetadataMap: Map[String, String] = { + def getMetadataMap: Map[String, Map[String, String]] = { val metadataJsonString = getMetadata if (metadataJsonString.isEmpty) Map.empty else { implicit val formats: DefaultFormats.type = DefaultFormats - JsonMethods.parse(metadataJsonString).extract[Map[String, String]] + JsonMethods.parse(metadataJsonString).extract[Map[String, Map[String, String]]] } } protected def getModelParameters: ModelParameters = { - val modelParameters = new ModelParameters().setContinuousBatching(true) // Always enabled + val modelParameters = new ModelParameters().enableContBatching() // Always enabled + // TODO: rename params? 
and check which ones are still missing if (isDefined(chatTemplate)) modelParameters.setChatTemplate(getChatTemplate) if (isDefined(defragmentationThreshold)) - modelParameters.setDefragmentationThreshold(getDefragmentationThreshold) - if (isDefined(embedding)) modelParameters.setEmbedding(getEmbedding) - if (isDefined(flashAttention)) modelParameters.setFlashAttention(getFlashAttention) + modelParameters.setDefragThold(getDefragmentationThreshold) + if (isDefined(embedding)) if (getEmbedding) modelParameters.enableEmbedding() + if (isDefined(flashAttention)) if (getFlashAttention) modelParameters.enableFlashAttn() if (isDefined(gpuSplitMode)) modelParameters.setSplitMode(GpuSplitMode.valueOf(getSplitMode)) - if (isDefined(grpAttnN)) modelParameters.setGrpAttnN(getGrpAttnN) - if (isDefined(grpAttnW)) modelParameters.setGrpAttnN(getGrpAttnW) - if (isDefined(inputPrefixBos)) modelParameters.setInputPrefixBos(getInputPrefixBos) - if (isDefined(lookupCacheDynamicFilePath)) - modelParameters.setLookupCacheDynamicFilePath(getLookupCacheDynamicFilePath) - if (isDefined(lookupCacheStaticFilePath)) - modelParameters.setLookupCacheStaticFilePath(getLookupCacheStaticFilePath) +// if (isDefined(grpAttnN)) modelParameters.setGrpAttnN(getGrpAttnN) +// if (isDefined(grpAttnW)) modelParameters.setGrpAttnN(getGrpAttnW) +// if (isDefined(inputPrefixBos)) modelParameters.setInputPrefixBos(getInputPrefixBos) +// if (isDefined(lookupCacheDynamicFilePath)) +// modelParameters.setLookupCacheDynamicFilePath(getLookupCacheDynamicFilePath) +// if (isDefined(lookupCacheStaticFilePath)) +// modelParameters.setLookupCacheStaticFilePath(getLookupCacheStaticFilePath) if (isDefined(mainGpu)) modelParameters.setMainGpu(getMainGpu) if (isDefined(modelDraft)) modelParameters.setModelDraft(getModelDraft) - if (isDefined(nBatch)) modelParameters.setNBatch(getNBatch) - if (isDefined(nChunks)) modelParameters.setNChunks(getNChunks) - if (isDefined(nCtx)) modelParameters.setNCtx(getNCtx) - if (isDefined(nDraft)) modelParameters.setNDraft(getNDraft) - if (isDefined(nGpuLayers)) modelParameters.setNGpuLayers(getNGpuLayers) - if (isDefined(nGpuLayersDraft)) modelParameters.setNGpuLayersDraft(getNGpuLayersDraft) - if (isDefined(nSequences)) modelParameters.setNSequences(getNSequences) - if (isDefined(nThreads)) modelParameters.setNThreads(getNThreads) - if (isDefined(nThreadsBatch)) modelParameters.setNThreadsBatch(getNThreadsBatch) - if (isDefined(nThreadsBatchDraft)) - modelParameters.setNThreadsBatchDraft(getNThreadsBatchDraft) - if (isDefined(nThreadsDraft)) modelParameters.setNThreadsDraft(getNThreadsDraft) - if (isDefined(nUbatch)) modelParameters.setNUbatch(getNUbatch) - if (isDefined(noKvOffload)) modelParameters.setNoKvOffload(getNoKvOffload) - if (isDefined(numaStrategy)) modelParameters.setNuma(NumaStrategy.valueOf(getNuma)) - if (isDefined(pSplit)) modelParameters.setPSplit(getPSplit) + if (isDefined(nBatch)) modelParameters.setBatchSize(getNBatch) +// if (isDefined(nChunks)) modelParameters.setNChunks(getNChunks) + if (isDefined(nCtx)) modelParameters.setCtxSize(getNCtx) + if (isDefined(nDraft)) modelParameters.setCtxSizeDraft(getNDraft) + if (isDefined(nGpuLayers)) modelParameters.setGpuLayers(getNGpuLayers) + if (isDefined(nGpuLayersDraft)) modelParameters.setGpuLayersDraft(getNGpuLayersDraft) +// if (isDefined(nSequences)) modelParameters.setNSequencis(getNSequences) + if (isDefined(nThreads)) modelParameters.setThreads(getNThreads) + if (isDefined(nThreadsBatch)) modelParameters.setThreadsBatch(getNThreadsBatch) +// 
if (isDefined(nThreadsBatchDraft)) +// modelParameters.setTh(getNThreadsBatchDraft) +// if (isDefined(nThreadsDraft)) modelParameters.setNThreadsDraft(getNThreadsDraft) + if (isDefined(nUbatch)) modelParameters.setUbatchSize(getNUbatch) + if (isDefined(noKvOffload)) if (getNoKvOffload) modelParameters.disableKvOffload() + if (isDefined(numaStrategy)) + modelParameters.setNuma(NumaStrategy.valueOf(getNuma)) +// if (isDefined(pSplit)) modelParameters.setPSplit(getPSplit) if (isDefined(poolingType)) modelParameters.setPoolingType(PoolingType.valueOf(getPoolingType)) if (isDefined(ropeFreqBase)) modelParameters.setRopeFreqBase(getRopeFreqBase) if (isDefined(ropeFreqScale)) modelParameters.setRopeFreqScale(getRopeFreqScale) if (isDefined(ropeScalingType)) - modelParameters.setRopeScalingType(RopeScalingType.valueOf(getRopeScalingType)) - if (isDefined(systemPrompt)) modelParameters.setSystemPrompt(getSystemPrompt) - if (isDefined(tensorSplit)) modelParameters.setTensorSplit(getTensorSplit.map(_.toFloat)) - if (isDefined(useMlock)) modelParameters.setUseMlock(getUseMlock) - if (isDefined(useMmap)) modelParameters.setUseMmap(getUseMmap) + modelParameters.setRopeScaling(RopeScalingType.valueOf(getRopeScalingType)) + // if (isDefined(tensorSplit)) modelParameters.setTensorSplit(getTensorSplit.map(_.toFloat)) + if (isDefined(useMlock)) if (getUseMlock) modelParameters.enableMlock + if (isDefined(useMmap)) if (!getUseMmap) modelParameters.disableMmap if (isDefined(yarnAttnFactor)) modelParameters.setYarnAttnFactor(getYarnAttnFactor) if (isDefined(yarnBetaFast)) modelParameters.setYarnBetaFast(getYarnBetaFast) if (isDefined(yarnBetaSlow)) modelParameters.setYarnBetaSlow(getYarnBetaSlow) if (isDefined(yarnExtFactor)) modelParameters.setYarnExtFactor(getYarnExtFactor) if (isDefined(yarnOrigCtx)) modelParameters.setYarnOrigCtx(getYarnOrigCtx) - if (loraAdapters.isSet) { - val loraAdaptersMap: mutable.Map[String, java.lang.Float] = - mutable.Map(getLoraAdapters.map { case (key, value) => - (key, float2Float(value)) - }.toSeq: _*) - modelParameters.setLoraAdapters(loraAdaptersMap.asJava) - } // Need to convert to mutable map first +// if (loraAdapters.isSet) { +// val loraAdaptersMap: mutable.Map[String, java.lang.Float] = +// mutable.Map(getLoraAdapters.map { case (key, value) => +// (key, float2Float(value)) +// }.toSeq: _*) +// modelParameters.addLoraAdapter(loraAdaptersMap.asJava) +// } // Need to convert to mutable map first modelParameters } // ---------------- GPU SUPPORT ---------------- // Values for automatic GPU support - protected val defaultGpuLayers = 1000 - protected val defaultMainGpu = 0 - - // Entrypoint for models. Automatically set GPU support if detected. - protected def setGpuSupportIfAvailable(spark: SparkSession): this.type = { - val usingGPUJar: Boolean = spark.sparkContext.listJars.exists(_.contains("spark-nlp-gpu")) - if (usingGPUJar) { - logger.info("Using GPU jar. Offloading all layers to GPU.") - setMainGpu(defaultMainGpu) - setNGpuLayers(defaultGpuLayers) - } - this - } +// protected val defaultGpuLayers = 1000 +// protected val defaultMainGpu = 0 +// +// // Entrypoint for models. Automatically set GPU support if detected. +// protected def setGpuSupportIfAvailable(spark: SparkSession): this.type = { +// val usingGPUJar: Boolean = spark.sparkContext.listJars.exists(_.contains("spark-nlp-gpu")) +// if (usingGPUJar) { +// logger.info("Using GPU jar. 
Offloading all layers to GPU.") +// setMainGpu(defaultMainGpu) +// setNGpuLayers(defaultGpuLayers) +// } +// this +// } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/PromptAssembler.scala b/src/main/scala/com/johnsnowlabs/nlp/PromptAssembler.scala index 3a58132965071d..c2774833c84126 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/PromptAssembler.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/PromptAssembler.scala @@ -1,7 +1,7 @@ package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT -import com.johnsnowlabs.nlp.llama.LlamaModel +import com.johnsnowlabs.nlp.llama.LlamaExtensions import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} @@ -229,7 +229,7 @@ class PromptAssembler(override val uid: String) Array(role, text) }.toArray - val chatString = LlamaModel.applyChatTemplate(template, chatArray, $(addAssistant)) + val chatString = LlamaExtensions.applyChatTemplate(template, chatArray, $(addAssistant)) Seq(Annotation(chatString)) } catch { case _: Exception => diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala index 4be4c98039058f..970e04c9673188 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala @@ -18,8 +18,9 @@ package com.johnsnowlabs.nlp.annotators.seq2seq import com.johnsnowlabs.ml.gguf.GGUFWrapper import com.johnsnowlabs.ml.util.LlamaCPP import com.johnsnowlabs.nlp._ -import com.johnsnowlabs.nlp.llama.LlamaModel +import com.johnsnowlabs.nlp.llama.LlamaExtensions import com.johnsnowlabs.nlp.util.io.ResourceHelper +import de.kherud.llama.{InferenceParameters, LlamaException, LlamaModel} import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.SparkSession @@ -138,9 +139,7 @@ class AutoGGUFModel(override val uid: String) if (_model.isEmpty) { _model = Some(spark.sparkContext.broadcast(wrapper)) } - - // Entrypoint for models. Automatically set GPU support if detected. - setGpuSupportIfAvailable(spark) + this } private[johnsnowlabs] def setEngine(engineName: String): this.type = set(engine, engineName) @@ -150,8 +149,10 @@ class AutoGGUFModel(override val uid: String) useChatTemplate -> true, nCtx -> 4096, nBatch -> 512, - embedding -> false, - nPredict -> 100) + embedding -> false, // TODO: Disable this? + nPredict -> 100, + nGpuLayers -> 99, + systemPrompt -> "You are a helpful assistant.") /** Sets the number of parallel processes for decoding. This is an alias for `setBatchSize`. 
* @@ -177,12 +178,13 @@ class AutoGGUFModel(override val uid: String) */ override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = { val annotations: Seq[Annotation] = batchedAnnotations.flatten + // TODO: group by doc and sentence if (annotations.nonEmpty) { - val annotationsText = annotations.map(_.result) + val annotationsText = annotations.map { anno => anno.result } val modelParams = - getModelParameters.setNParallel(getBatchSize) // set parallel decoding to batch size - val inferenceParams = getInferenceParameters + getModelParameters.setParallel(getBatchSize) // set parallel decoding to batch size + val inferenceParams: InferenceParameters = getInferenceParameters val model: LlamaModel = getModelIfNotSet.getSession(modelParams) @@ -190,9 +192,9 @@ class AutoGGUFModel(override val uid: String) // Return embeddings in annotation val (embeddings: Array[Array[Float]], metadata: Map[String, String]) = try { - (model.requestBatchEmbeddings(annotationsText.toArray), Map.empty) + (annotationsText.map(model.embed), Map.empty) } catch { - case e: Exception => + case e: LlamaException => logger.error("Error in llama.cpp embeddings", e) ( Array.fill[Array[Float]](annotationsText.length)(Array.empty), @@ -212,9 +214,12 @@ class AutoGGUFModel(override val uid: String) } else { val (completedTexts: Array[String], metadata: Map[String, String]) = try { - (model.requestBatchCompletion(annotationsText.toArray, inferenceParams), Map.empty) + val results: Array[String] = annotationsText.map { t => + LlamaExtensions.complete(model, inferenceParams, getSystemPrompt, t) + }.toArray + (results, Map.empty) } catch { - case e: Exception => + case e: LlamaException => logger.error("Error in llama.cpp batch completion", e) (Array.fill(annotationsText.length)(""), Map("llamacpp_exception" -> e.getMessage)) } @@ -268,7 +273,7 @@ trait ReadAutoGGUFModel { .setModelIfNotSet(spark, GGUFWrapper.read(spark, localPath)) .setEngine(LlamaCPP.name) - val metadata = LlamaModel.getMetadataFromFile(localPath) + val metadata = LlamaExtensions.getMetadataFromFile(localPath) if (metadata.nonEmpty) annotatorModel.setMetadata(metadata) annotatorModel } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModel.scala index 62b4d4903ec97b..65e3f8371c65c8 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModel.scala @@ -18,8 +18,7 @@ package com.johnsnowlabs.nlp.annotators.seq2seq import com.johnsnowlabs.ml.gguf.GGUFWrapperMultiModal import com.johnsnowlabs.ml.util.LlamaCPP import com.johnsnowlabs.nlp._ -import com.johnsnowlabs.nlp.annotators.cv.util.io.ImageIOUtils -import com.johnsnowlabs.nlp.llama.{LlamaException, LlamaModel} +import com.johnsnowlabs.nlp.llama.LlamaExtensions import com.johnsnowlabs.nlp.util.io.ResourceHelper import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.util.Identifiable @@ -158,6 +157,9 @@ class AutoGGUFVisionModel(override val uid: String) with HasLlamaCppInferenceProperties with HasProtectedParams { + throw new NotImplementedError( + "AutoGGUFVisionModel is not implemented yet for this release. 
Please use the previous Spark NLP release or AutoGGUFModel for text-only tasks.") + override val inputAnnotatorTypes: Array[AnnotatorType] = Array(AnnotatorType.IMAGE, AnnotatorType.DOCUMENT) override val outputAnnotatorType: AnnotatorType = AnnotatorType.DOCUMENT @@ -178,8 +180,6 @@ class AutoGGUFVisionModel(override val uid: String) _model = Some(spark.sparkContext.broadcast(wrapper)) } - // Entrypoint for models. Automatically set GPU support if detected. - setGpuSupportIfAvailable(spark) this } @@ -235,46 +235,47 @@ class AutoGGUFVisionModel(override val uid: String) * sentences that belong to the same original row !! (challenging) */ override def batchAnnotate( - batchedAnnotations: Seq[(Annotation, AnnotationImage)]): Seq[Seq[Annotation]] = { - if (batchedAnnotations.nonEmpty) { - - // set parallel decoding to batch size - val modelParams = getModelParameters.setNParallel(getBatchSize) - val model: LlamaModel = getModelIfNotSet.getSession(modelParams) - - val (prompts, base64EncodedImages) = batchedAnnotations.unzip match { - case (promptAnnotations, imageAnnotations) => - ( - promptAnnotations.map(_.result).toArray, - imageAnnotations - .map(imgAnno => ImageIOUtils.encodeImageBase64(imgAnno.result)) - .toArray) - } - - val (completedTexts: Array[String], metadata: Map[String, String]) = - try { - ( - model.requestBatchImageCompletion( - prompts, - base64EncodedImages, - getInferenceParameters), - Map.empty) - } catch { - case e: LlamaException => - logger.error("Error in llama.cpp image batch completion", e) - (Array.fill(prompts.length)(""), Map("llamacpp_exception" -> e.getMessage)) - } - - val result: Seq[Seq[Annotation]] = - batchedAnnotations.zip(completedTexts).map { - case ((textAnnotation: Annotation, imageAnnotation: AnnotationImage), text) => - val totalMetadata = - textAnnotation.metadata ++ imageAnnotation.metadata ++ metadata - Seq(new Annotation(outputAnnotatorType, 0, text.length - 1, text, totalMetadata)) - } - result - } else Seq(Seq.empty[Annotation]) - } + batchedAnnotations: Seq[(Annotation, AnnotationImage)]): Seq[Seq[Annotation]] = ??? 
+// { +// if (batchedAnnotations.nonEmpty) { +// +// // set parallel decoding to batch size +// val modelParams = getModelParameters.setParallel(getBatchSize) +// val model: LlamaModel = getModelIfNotSet.getSession(modelParams) +// +// val (prompts, base64EncodedImages) = batchedAnnotations.unzip match { +// case (promptAnnotations, imageAnnotations) => +// ( +// promptAnnotations.map(_.result).toArray, +// imageAnnotations +// .map(imgAnno => ImageIOUtils.encodeImageBase64(imgAnno.result)) +// .toArray) +// } +// +// val (completedTexts: Array[String], metadata: Map[String, String]) = +// try { +// ( +// model.requestBatchImageCompletion( +// prompts, +// base64EncodedImages, +// getInferenceParameters), +// Map.empty) +// } catch { +// case e: LlamaException => +// logger.error("Error in llama.cpp image batch completion", e) +// (Array.fill(prompts.length)(""), Map("llamacpp_exception" -> e.getMessage)) +// } +// +// val result: Seq[Seq[Annotation]] = +// batchedAnnotations.zip(completedTexts).map { +// case ((textAnnotation: Annotation, imageAnnotation: AnnotationImage), text) => +// val totalMetadata = +// textAnnotation.metadata ++ imageAnnotation.metadata ++ metadata +// Seq(new Annotation(outputAnnotatorType, 0, text.length - 1, text, totalMetadata)) +// } +// result +// } else Seq(Seq.empty[Annotation]) +// } } trait ReadablePretrainedAutoGGUFVisionModel @@ -322,7 +323,7 @@ trait ReadAutoGGUFVisionModel { .setEngine(LlamaCPP.name) // TODO mmproj metadata necessary? - val metadata = LlamaModel.getMetadataFromFile(localPathModel) + val metadata = LlamaExtensions.getMetadataFromFile(localPathModel) if (metadata.nonEmpty) annotatorModel.setMetadata(metadata) annotatorModel } diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings.scala index 389166a7ad10f6..06ae1053f24a8d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings.scala @@ -18,7 +18,8 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.ml.gguf.GGUFWrapper import com.johnsnowlabs.ml.util.LlamaCPP import com.johnsnowlabs.nlp._ -import com.johnsnowlabs.nlp.llama.LlamaModel +import com.johnsnowlabs.nlp.llama.LlamaExtensions +import de.kherud.llama.LlamaModel import com.johnsnowlabs.nlp.util.io.ResourceHelper import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.util.Identifiable @@ -130,7 +131,8 @@ class AutoGGUFEmbeddings(override val uid: String) _model = Some(spark.sparkContext.broadcast(wrapper)) } - setGpuSupportIfAvailable(spark) + this +// setGpuSupportIfAvailable(spark) } private[johnsnowlabs] def setEngine(engineName: String): this.type = set(engine, engineName) @@ -140,7 +142,8 @@ class AutoGGUFEmbeddings(override val uid: String) embedding -> true, poolingType -> "MEAN", nCtx -> 4096, - nBatch -> 512) + nBatch -> 512, + nGpuLayers -> 99) /** Sets the number of parallel processes for decoding. This is an alias for `setBatchSize`. 
* @@ -172,7 +175,7 @@ class AutoGGUFEmbeddings(override val uid: String) if (annotations.nonEmpty) { val modelParams = - getModelParameters.setNParallel(getBatchSize) // set parallel decoding to batch size + getModelParameters.setParallel(getBatchSize) // set parallel decoding to batch size val model: LlamaModel = getModelIfNotSet.getSession(modelParams) @@ -181,7 +184,8 @@ class AutoGGUFEmbeddings(override val uid: String) // Return embeddings in annotation val (embeddings: Array[Array[Float]], metadata: Map[String, String]) = try { - (model.requestBatchEmbeddings(annotationsText.toArray), Map.empty) + val result: Array[Array[Float]] = annotationsText.map(model.embed).toArray + (result, Map.empty) } catch { case e: Exception => logger.error("Error in llama.cpp embeddings", e) @@ -241,7 +245,7 @@ trait ReadAutoGGUFEmbeddings { .setModelIfNotSet(spark, GGUFWrapper.read(spark, localPath)) .setEngine(LlamaCPP.name) - val metadata = LlamaModel.getMetadataFromFile(localPath) + val metadata = LlamaExtensions.getMetadataFromFile(localPath) if (metadata.nonEmpty) annotatorModel.setMetadata(metadata) annotatorModel } diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala index 01cb289903550d..96be045eef66a3 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala @@ -87,23 +87,23 @@ class AutoGGUFModelTest extends AnyFlatSpec { it should "accept all parameters that are settable" taggedAs SlowTest in { // Model Parameters model.setNThreads(8) - model.setNThreadsDraft(8) +// model.setNThreadsDraft(8) model.setNThreadsBatch(8) - model.setNThreadsBatchDraft(8) +// model.setNThreadsBatchDraft(8) model.setNCtx(512) model.setNBatch(32) model.setNUbatch(32) model.setNDraft(5) - model.setNChunks(-1) - model.setNSequences(1) - model.setPSplit(0.1f) +// model.setNChunks(-1) +// model.setNSequences(1) +// model.setPSplit(0.1f) model.setNGpuLayers(99) model.setNGpuLayersDraft(99) model.setGpuSplitMode("NONE") model.setMainGpu(0) - model.setTensorSplit(Array[Double]()) - model.setGrpAttnN(1) - model.setGrpAttnW(512) +// model.setTensorSplit(Array[Double]()) +// model.setGrpAttnN(1) +// model.setGrpAttnW(512) model.setRopeFreqBase(1.0f) model.setRopeFreqScale(1.0f) model.setYarnExtFactor(1.0f) @@ -113,14 +113,14 @@ class AutoGGUFModelTest extends AnyFlatSpec { model.setYarnOrigCtx(0) model.setDefragmentationThreshold(-1.0f) model.setNumaStrategy("DISTRIBUTE") - model.setRopeScalingType("UNSPECIFIED") - model.setPoolingType("UNSPECIFIED") + model.setRopeScalingType("NONE") + model.setPoolingType("NONE") model.setModelDraft("") - model.setLookupCacheStaticFilePath("/tmp/sparknlp-llama-cpp-cache") - model.setLookupCacheDynamicFilePath("/tmp/sparknlp-llama-cpp-cache") +// model.setLookupCacheStaticFilePath("/tmp/sparknlp-llama-cpp-cache") +// model.setLookupCacheDynamicFilePath("/tmp/sparknlp-llama-cpp-cache") model.setEmbedding(false) model.setFlashAttention(false) - model.setInputPrefixBos(false) +// model.setInputPrefixBos(false) model.setUseMmap(false) model.setUseMlock(false) model.setNoKvOffload(false) @@ -130,7 +130,7 @@ class AutoGGUFModelTest extends AnyFlatSpec { // Inference Parameters model.setInputPrefix("") model.setInputSuffix("") - model.setCachePrompt(false) + model.setCachePrompt(true) model.setNPredict(-1) model.setTopK(40) model.setTopP(0.9f) @@ -164,7 +164,7 @@ 
class AutoGGUFModelTest extends AnyFlatSpec { // Struct Features model.setTokenIdBias(Map(0 -> 0.0f, 1 -> 0.0f)) model.setTokenBias(Map("!" -> 0.0f, "?" -> 0.0f)) - model.setLoraAdapters(Map(" " -> 0.0f)) +// model.setLoraAdapters(Map(" " -> 0.0f)) lazy val pipeline = new Pipeline().setStages(Array(documentAssembler, model)) diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTestSpec.scala index f9a90635d6ac2d..0c05df61fa7c39 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTestSpec.scala @@ -41,7 +41,8 @@ class AutoGGUFEmbeddingsTestSpec extends AnyFlatSpec { .setBatchSize(4) .setPoolingType(poolingType) .setNCtx(8192) - def pipeline(embedModel: AutoGGUFEmbeddings = model("MEAN")) = + + def pipeline(embedModel: AutoGGUFEmbeddings = model("MEAN")): Pipeline = new Pipeline().setStages(Array(documentAssembler, embedModel)) it should "produce embeddings" taggedAs SlowTest in { @@ -110,8 +111,8 @@ class AutoGGUFEmbeddingsTestSpec extends AnyFlatSpec { it should "embed long text" taggedAs SlowTest in { val result = pipeline( model("MEAN") - .setNUbatch(2048) - .setNBatch(2048)).fit(longData).transform(longData) + .setNUbatch(4096) + .setNBatch(4096)).fit(longData).transform(longData) val collected = Annotation.collect(result, "embeddings") assert(collected.length == longDataCopies, "Should return the same number of rows")
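
The hunks above migrate the GGUF annotators from the bundled com.johnsnowlabs.nlp.llama bindings to the upstream de.kherud.llama 1.0.1 artifacts, rename several ModelParameters setters (setModelFilePath -> setModel, setNCtx -> setCtxSize, setNParallel -> setParallel, and so on), comment out the retired draft/lookup-cache/LoRA parameters, and change the AutoGGUFModel defaults (nGpuLayers -> 99, plus a default systemPrompt). As a minimal, hedged sketch of how the annotator is driven after this change — based only on the parameter names visible in the updated tests, with the model resolved by pretrained() left at its default and `data` standing in for any DataFrame with a "text" column — Scala usage would look roughly like:

    import com.johnsnowlabs.nlp.DocumentAssembler
    import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel
    import org.apache.spark.ml.Pipeline

    // Assemble raw text into DOCUMENT annotations.
    val documentAssembler = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")

    // AutoGGUFModel now defaults to nGpuLayers = 99 and a generic system prompt,
    // so the removed setGpuSupportIfAvailable hook is no longer needed for GPU offloading.
    val autoGGUF = AutoGGUFModel
      .pretrained()
      .setInputCols("document")
      .setOutputCol("completions")
      .setBatchSize(4)      // batch size doubles as the number of parallel decoding slots (setParallel)
      .setNPredict(20)
      .setTemperature(0.4f)

    val pipeline = new Pipeline().setStages(Array(documentAssembler, autoGGUF))
    val results = pipeline.fit(data).transform(data)

The embeddings path changes in the same spirit: as the AutoGGUFEmbeddings hunk shows, each document is now passed through model.embed rather than the removed requestBatchEmbeddings call.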