@@ -1,14 +1,15 @@
 # # Global Args #################################################################
-ARG BASE_UBI_IMAGE_TAG=9.4-1181
-ARG PROTOC_VERSION=25.2
+ARG BASE_UBI_IMAGE_TAG=latest
+ARG PROTOC_VERSION=25.3
 ARG PYTORCH_INDEX="https://download.pytorch.org/whl"
 # ARG PYTORCH_INDEX="https://download.pytorch.org/whl/nightly"
+ARG AUTO_GPTQ_VERSION=0.7.1
 
 # match PyTorch version that was used to compile flash-attention v2 pre-built wheels
 # e.g. flash-attn v2.5.2 => torch ['1.12.1', '1.13.1', '2.0.1', '2.1.2', '2.2.0', '2.3.0.dev20240126']
 # https://github.com/Dao-AILab/flash-attention/blob/v2.5.2/.github/workflows/publish.yml#L47
 # use nightly build index for torch .dev pre-release versions
-ARG PYTORCH_VERSION=2.2.0
+ARG PYTORCH_VERSION=2.2.1
 
 ARG PYTHON_VERSION=3.11
 
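
Note: the torch pin above has to line up on two axes: flash-attention only publishes pre-built wheels for the torch versions listed in its publish workflow, and the `+cu121` local version used further down must match the CUDA 12.1 toolkit installed in the image. A minimal sanity check, assuming it is run inside the built image (where the /opt/tgis venv is on PATH):

    python -c "import torch; print(torch.__version__, torch.version.cuda)"
    # expected with these pins: 2.2.1+cu121 12.1
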
@@ -35,18 +36,19 @@ ENV LANG=C.UTF-8 \
 # # CUDA Base ###################################################################
 FROM base as cuda-base
 
-ENV CUDA_VERSION=11.8.0 \
-    NV_CUDA_LIB_VERSION=11.8.0-1 \
+# Ref: https://docs.nvidia.com/cuda/archive/12.1.0/cuda-toolkit-release-notes/
+ENV CUDA_VERSION=12.1.0 \
+    NV_CUDA_LIB_VERSION=12.1.0-1 \
     NVIDIA_VISIBLE_DEVICES=all \
     NVIDIA_DRIVER_CAPABILITIES=compute,utility \
-    NV_CUDA_CUDART_VERSION=11.8.89-1 \
-    NV_CUDA_COMPAT_VERSION=520.61.05-1
+    NV_CUDA_CUDART_VERSION=12.1.55-1 \
+    NV_CUDA_COMPAT_VERSION=530.30.02-1
 
 RUN dnf config-manager \
        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
     && dnf install -y \
-       cuda-cudart-11-8-${NV_CUDA_CUDART_VERSION} \
-       cuda-compat-11-8-${NV_CUDA_COMPAT_VERSION} \
+       cuda-cudart-12-1-${NV_CUDA_CUDART_VERSION} \
+       cuda-compat-12-1-${NV_CUDA_COMPAT_VERSION} \
     && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
     && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
     && dnf clean all
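
Note: cuda-compat-12-1 supplies the forward-compatibility user-space driver libraries, which lets this CUDA 12.1 image run on hosts whose installed NVIDIA driver predates the 530 series. A hedged pre-build check that the pinned package versions actually exist in NVIDIA's rhel9 repo (the repo id `cuda-tmp` is arbitrary):

    dnf --repofrompath=cuda-tmp,https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64 \
        list available 'cuda-cudart-12-1*' 'cuda-compat-12-1*'
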
@@ -56,53 +58,35 @@ ENV CUDA_HOME="/usr/local/cuda" \
     LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
 
 
-# # CUDA Runtime ################################################################
-FROM cuda-base as cuda-runtime
-
-ENV NV_NVTX_VERSION=11.8.86-1 \
-    NV_LIBNPP_VERSION=11.8.0.86-1 \
-    NV_LIBCUBLAS_VERSION=11.11.3.6-1 \
-    NV_LIBNCCL_PACKAGE_VERSION=2.15.5-1+cuda11.8
-
-RUN dnf config-manager \
-       --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
-    && dnf install -y \
-       cuda-libraries-11-8-${NV_CUDA_LIB_VERSION} \
-       cuda-nvtx-11-8-${NV_NVTX_VERSION} \
-       libnpp-11-8-${NV_LIBNPP_VERSION} \
-       libcublas-11-8-${NV_LIBCUBLAS_VERSION} \
-       libnccl-${NV_LIBNCCL_PACKAGE_VERSION} \
-    && dnf clean all
-
-
 # # CUDA Development ############################################################
 FROM cuda-base as cuda-devel
 
-ENV NV_CUDA_CUDART_DEV_VERSION=11.8.89-1 \
-    NV_NVML_DEV_VERSION=11.8.86-1 \
-    NV_LIBCUBLAS_DEV_VERSION=11.11.3.6-1 \
-    NV_LIBNPP_DEV_VERSION=11.8.0.86-1 \
-    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.15.5-1+cuda11.8
+# Ref: https://developer.nvidia.com/nccl/nccl-legacy-downloads
+ENV NV_CUDA_CUDART_DEV_VERSION=12.1.55-1 \
+    NV_NVML_DEV_VERSION=12.1.55-1 \
+    NV_LIBCUBLAS_DEV_VERSION=12.1.0.26-1 \
+    NV_LIBNPP_DEV_VERSION=12.0.2.50-1 \
+    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.3-1+cuda12.1
 
 RUN dnf config-manager \
        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
     && dnf install -y \
-       cuda-command-line-tools-11-8-${NV_CUDA_LIB_VERSION} \
-       cuda-libraries-devel-11-8-${NV_CUDA_LIB_VERSION} \
-       cuda-minimal-build-11-8-${NV_CUDA_LIB_VERSION} \
-       cuda-cudart-devel-11-8-${NV_CUDA_CUDART_DEV_VERSION} \
-       cuda-nvml-devel-11-8-${NV_NVML_DEV_VERSION} \
-       libcublas-devel-11-8-${NV_LIBCUBLAS_DEV_VERSION} \
-       libnpp-devel-11-8-${NV_LIBNPP_DEV_VERSION} \
+       cuda-command-line-tools-12-1-${NV_CUDA_LIB_VERSION} \
+       cuda-libraries-devel-12-1-${NV_CUDA_LIB_VERSION} \
+       cuda-minimal-build-12-1-${NV_CUDA_LIB_VERSION} \
+       cuda-cudart-devel-12-1-${NV_CUDA_CUDART_DEV_VERSION} \
+       cuda-nvml-devel-12-1-${NV_NVML_DEV_VERSION} \
+       libcublas-devel-12-1-${NV_LIBCUBLAS_DEV_VERSION} \
+       libnpp-devel-12-1-${NV_LIBNPP_DEV_VERSION} \
        libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
     && dnf clean all
 
 ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"
 
 
 # # Rust builder ################################################################
-# Specific debian version so that compatible glibc version is used
-FROM rust:1.76-bullseye as rust-builder
+# Using bookworm for compilation so the rust binaries get linked against libssl.so.3
+FROM rust:1.78-bookworm as rust-builder
 ARG PROTOC_VERSION
 
 ENV CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
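
Note: the bullseye-to-bookworm switch changes which OpenSSL the Rust binaries are linked against at build time (bookworm ships OpenSSL 3, hence the libssl.so.3 comment). A quick verification sketch; the binary name and path here are assumptions for illustration, not taken from this diff:

    # hypothetical install path; substitute the real router binary
    ldd /usr/local/bin/text-generation-router | grep -E 'libssl|libcrypto'
    # a bookworm-built binary should resolve libssl.so.3 / libcrypto.so.3
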
@@ -180,6 +164,9 @@ RUN cd server && \
     make gen-server && \
     pip install ".[accelerate]" --no-cache-dir
 
+# temp: install newer transformers lib that optimum clashes with
+RUN pip install transformers==4.40.0 tokenizers==0.19.1 --no-cache-dir
+
 # Patch codegen model changes into transformers
 RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py
 
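
Note: per the `temp:` comment, the transformers/tokenizers pin is applied after the server install so that it overrides whatever version optimum's dependency resolution pulled in. A small check that the override stuck (a sketch, run inside the built image):

    pip show transformers tokenizers | grep -E '^(Name|Version)'
    # expected: transformers 4.40.0 and tokenizers 0.19.1
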
@@ -218,12 +205,12 @@ ENV PATH=/opt/tgis/bin/:$PATH
 # Install specific version of torch
 RUN pip install ninja==1.11.1.1 --no-cache-dir
 RUN pip install packaging --no-cache-dir
-RUN pip install torch==$PYTORCH_VERSION+cu118 --index-url "${PYTORCH_INDEX}/cu118" --no-cache-dir
+RUN pip install torch==$PYTORCH_VERSION+cu121 --index-url "${PYTORCH_INDEX}/cu121" --no-cache-dir
 
 
 # # Build flash attention v2 ####################################################
 FROM python-builder as flash-att-v2-builder
-ARG FLASH_ATT_VERSION=v2.5.2
+ARG FLASH_ATT_VERSION=v2.5.6
 
 WORKDIR /usr/src/flash-attention-v2
 
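
Note: in the build step shown in the next hunk's context, MAX_JOBS caps the number of parallel compile jobs that torch's extension builder launches, trading build time for lower peak memory while the CUDA kernels compile. The equivalent standalone invocation would look roughly like this (illustrative; it assumes torch is already installed, as it is in python-builder):

    MAX_JOBS=2 pip --verbose wheel --no-deps flash-attn==v2.5.6
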
@@ -237,14 +224,15 @@ RUN MAX_JOBS=2 pip --verbose wheel --no-deps flash-attn==${FLASH_ATT_VERSION} \
 
 
 # # Install auto-gptq ###########################################################
-FROM python-builder as auto-gptq-installer
-ARG AUTO_GPTQ_REF=ccb6386ebfde63c17c45807d38779a93cd25846f
-
-WORKDIR /usr/src/auto-gptq-wheel
-
-# numpy is required to run auto-gptq's setup.py
-RUN pip install numpy
-RUN DISABLE_QIGEN=1 pip wheel git+https://github.com/AutoGPTQ/AutoGPTQ@${AUTO_GPTQ_REF} --no-cache-dir --no-deps --verbose
+# # Uncomment if a custom autogptq build is required
+# FROM python-builder as auto-gptq-installer
+# ARG AUTO_GPTQ_REF=896d8204bc89a7cfbda42bf3314e13cf4ce20b02
+#
+# WORKDIR /usr/src/auto-gptq-wheel
+#
+# # numpy is required to run auto-gptq's setup.py
+# RUN pip install numpy
+# RUN DISABLE_QIGEN=1 pip wheel git+https://github.com/AutoGPTQ/AutoGPTQ@${AUTO_GPTQ_REF} --no-cache-dir --no-deps --verbose
 
 # # Build libraries #############################################################
 FROM python-builder as build
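
Note: the pinned-commit wheel build above is retired in favor of the released auto-gptq package (see ARG AUTO_GPTQ_VERSION=0.7.1 and the pip install in the python-installations stage below); the stage is kept as comments for when a pre-release or a PyTorch-nightly build is needed again. A hedged way to list the published releases (pip's `index` subcommand is still marked experimental):

    pip index versions auto-gptq
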
@@ -254,75 +242,76 @@ COPY server/custom_kernels/ /usr/src/.
 RUN cd /usr/src && python setup.py build_ext && python setup.py install
 
 
-# # Build transformers exllama kernels ##########################################
-FROM python-builder as exllama-kernels-builder
-
-WORKDIR /usr/src
-
-COPY server/exllama_kernels/ .
-RUN python setup.py build
-
-
-# # Build transformers exllamav2 kernels ########################################
-FROM python-builder as exllamav2-kernels-builder
-
-WORKDIR /usr/src
-
-COPY server/exllamav2_kernels/ .
-RUN python setup.py build
-
-
 # # Flash attention v2 cached build image #######################################
 FROM base as flash-att-v2-cache
 
 # Copy just the wheels we built for flash-attention
 COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2 /usr/src/flash-attention-v2
 
 
-# # Auto gptq cached build image
-FROM base as auto-gptq-cache
+# # Auto gptq cached build image ################################################
+# # Uncomment if a custom autogptq build is required
+# FROM base as auto-gptq-cache
+#
+# # Copy just the wheel we built for auto-gptq
+# COPY --from=auto-gptq-installer /usr/src/auto-gptq-wheel /usr/src/auto-gptq-wheel
 
-# Copy just the wheel we built for auto-gptq
-COPY --from=auto-gptq-installer /usr/src/auto-gptq-wheel /usr/src/auto-gptq-wheel
 
+# # Full set of python installations for server release #########################
+
+FROM python-builder as python-installations
 
-# # Final Inference Server image ################################################
-FROM cuda-runtime as server-release
 ARG PYTHON_VERSION
+ARG AUTO_GPTQ_VERSION
 ARG SITE_PACKAGES=/opt/tgis/lib/python${PYTHON_VERSION}/site-packages
 
-# Install C++ compiler (required at runtime when PT2_COMPILE is enabled)
-RUN dnf install -y gcc-c++ git && dnf clean all \
-    && useradd -u 2000 tgis -m -g 0
-
-SHELL ["/bin/bash", "-c"]
-
 COPY --from=build /opt/tgis /opt/tgis
 
+# `pip` is installed in the venv here
 ENV PATH=/opt/tgis/bin:$PATH
 
 # Install flash attention v2 from the cache build
 RUN --mount=type=bind,from=flash-att-v2-cache,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
     pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
 
-# Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
-
-# Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
-
 # Copy over the auto-gptq wheel and install it
-RUN --mount=type=bind,from=auto-gptq-cache,src=/usr/src/auto-gptq-wheel,target=/usr/src/auto-gptq-wheel \
-    pip install /usr/src/auto-gptq-wheel/*.whl --no-cache-dir
+# RUN --mount=type=bind,from=auto-gptq-cache,src=/usr/src/auto-gptq-wheel,target=/usr/src/auto-gptq-wheel \
+#     pip install /usr/src/auto-gptq-wheel/*.whl --no-cache-dir
+
+# We only need to install a custom-built auto-gptq version if we need a pre-release
+# or are using a PyTorch nightly version
+RUN pip install auto-gptq=="${AUTO_GPTQ_VERSION}" --no-cache-dir
 
 # Install server
+# git is required to pull the fms-extras dependency
+RUN dnf install -y git && dnf clean all
 COPY proto proto
 COPY server server
-RUN cd server && make gen-server && pip install ".[accelerate, ibm-fms, onnx-gpu, quantize]" --no-cache-dir
+# Extra url is required to install cuda-12 version of onnxruntime-gpu
+# Ref: https://onnxruntime.ai/docs/install/#install-onnx-runtime-gpu-cuda-12x
+RUN cd server && make gen-server && pip install ".[accelerate, ibm-fms, onnx-gpu, quantize]" --no-cache-dir --extra-index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
+
+# temp: install newer transformers lib that optimum clashes with
+RUN pip install transformers==4.40.0 tokenizers==0.19.1 --no-cache-dir
 
 # Patch codegen model changes into transformers 4.35
 RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py
 
+
+# # Final Inference Server image ################################################
+FROM base as server-release
+ARG PYTHON_VERSION
+ARG SITE_PACKAGES=/opt/tgis/lib/python${PYTHON_VERSION}/site-packages
+
+# Install C++ compiler (required at runtime when PT2_COMPILE is enabled)
+RUN dnf install -y gcc-c++ && dnf clean all \
+    && useradd -u 2000 tgis -m -g 0
+
+# Copy in the full python environment
+COPY --from=python-installations /opt/tgis /opt/tgis
+
+ENV PATH=/opt/tgis/bin:$PATH
+
 # Print a list of all installed packages and versions
 RUN pip list -v --disable-pip-version-check --no-python-version-warning
 
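
Net effect of this commit: the image moves from CUDA 11.8 to 12.1 (torch 2.2.1+cu121, flash-attn v2.5.6, auto-gptq 0.7.1), the cuda-runtime and exllama kernel-builder stages are removed, and server-release now starts from the plain UBI base and copies in a fully prepared /opt/tgis environment from the new python-installations stage. A hypothetical build invocation from the repo root (stage and arg names come from this Dockerfile; the tag, and pinning the UBI tag back to its previous value, are illustrative):

    docker build . \
        --target server-release \
        --build-arg BASE_UBI_IMAGE_TAG=9.4-1181 \
        --build-arg PYTORCH_VERSION=2.2.1 \
        -t tgis:cuda-12.1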