From 253a252e96646c0785eb8d156ec4bd3085f4dcb8 Mon Sep 17 00:00:00 2001
From: Ajay Saini <ajay@mosaicml.com>
Date: Wed, 21 Jun 2023 13:22:49 -0700
Subject: [PATCH 01/12] Hosted handler for MPT

---
 .../mpt/mpt_hosted_handler.py                 | 424 ++++++++++++++++++
 1 file changed, 424 insertions(+)
 create mode 100644 examples/inference-deployments/mpt/mpt_hosted_handler.py

diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py
new file mode 100644
index 000000000..8866dd7c8
--- /dev/null
+++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py
@@ -0,0 +1,424 @@
+# Copyright 2022 MosaicML Examples authors
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import configparser
+import copy
+import os
+from typing import Dict, List, Tuple, Optional
+from pathlib import Path
+from urllib.parse import urlparse
+
+import boto3
+import botocore
+import torch
+import torch.distributed as dist
+from FasterTransformer.examples.pytorch.gpt.utils.parallel_gpt import ParallelGPT  # yapf: disable # type: ignore
+from FasterTransformer.examples.pytorch.gpt.utils import comm  # yapf: disable # type: ignore
+from scripts.inference.convert_hf_mpt_to_ft import convert_mpt_to_ft  # yapf: disable # type: ignore
+from torch.nn.utils.rnn import pad_sequence
+from transformers import AutoTokenizer
+from huggingface_hub import snapshot_download
+
+LOCAL_CHECKPOINT_DIR = '/tmp/mpt'  # S3 downloads and FT checkpoints go here
+LOCAL_MODEL_PATH = os.path.join(LOCAL_CHECKPOINT_DIR, 'local_model')
+
+
+def download_convert(s3_path: Optional[str] = None,
+                     hf_path: Optional[str] = None,
+                     gpus: int = 1,
+                     force_conversion: bool = False):
+    """Download model and convert to FasterTransformer format.
+
+    Args:
+        s3_path (str): Path for model location in an s3 bucket.
+        hf_path (str): Name of the model as on HF hub (e.g., mosaicml/mpt-7b-instruct) or local folder name containing
+            the model (e.g., mpt-7b-instruct)
+        gpus (int): Number of gpus to use for inference (Default: 1)
+        force_conversion (bool): Force conversion to FT even if some features may not work as expected in FT (Default: False)
+
+    Raises:
+        RuntimeError: If neither path is given, or if no FT checkpoint exists after conversion.
+    """
+    if not s3_path and not hf_path:
+        raise RuntimeError(
+            'Either s3_path or hf_path must be provided to download_convert')
+    model_name_or_path: str = ''
+    if s3_path:
+        # s3 creds need to already be present as env vars
+        s3 = boto3.client('s3')
+        model_name_or_path = LOCAL_MODEL_PATH
+        # Download model files unless a previous run already created the folder
+        if os.path.exists(LOCAL_MODEL_PATH):
+            print(
+                f'[+] Path {LOCAL_MODEL_PATH} already exists, skipping download'
+            )
+        else:
+            Path(LOCAL_MODEL_PATH).mkdir(parents=True, exist_ok=True)
+            print(f'Downloading model from path: {s3_path}')
+            parsed_path = urlparse(s3_path)
+            # Paginate: one listing call is capped and may omit 'Contents'.
+            pages = s3.get_paginator('list_objects_v2').paginate(
+                Bucket=parsed_path.netloc, Prefix=parsed_path.path.lstrip('/'))
+            keys = (o['Key'] for p in pages for o in p.get('Contents', []))
+            for file_key in keys:
+                file_name = os.path.basename(file_key)
+                if not file_name:
+                    continue  # 'folder/' placeholder key, nothing to download
+                try:
+                    s3.download_file(Bucket=parsed_path.netloc,
+                                     Key=file_key,
+                                     Filename=os.path.join(
+                                         LOCAL_MODEL_PATH, file_name))
+                except botocore.exceptions.ClientError as e:
+                    print(
+                        f'Error downloading file with key: {file_key} with error: {e}'
+                    )
+    elif hf_path:
+        model_name_or_path = hf_path
+        if not os.path.isdir(hf_path):  # local folders are already on disk
+            print(f'Downloading HF model with name: {hf_path}')
+            snapshot_download(repo_id=hf_path)
+
+    # This is the format that the conversion script saves the converted checkpoint in
+    local_ft_model_path = os.path.join(LOCAL_CHECKPOINT_DIR, f'{gpus}-gpu')
+    ckpt_config_path = os.path.join(local_ft_model_path, 'config.ini')
+
+    # Convert model to FT format, unless an FT checkpoint already exists.
+    if not os.path.isfile(ckpt_config_path):
+        print('Converting model to FT format')
+        # Datatype of weights in the HF checkpoint
+        weight_data_type = 'fp32'
+        convert_mpt_to_ft(model_name_or_path, LOCAL_CHECKPOINT_DIR, gpus,
+                          weight_data_type, force_conversion)
+        if not os.path.isfile(ckpt_config_path):
+            raise RuntimeError('Failed to create FT checkpoint')
+    else:
+        print(f'Reusing existing FT checkpoint at {local_ft_model_path}')
+
+
+class MPTFTHostedModelHandler:
+    """Request handler that runs MPT generation on FasterTransformer."""
+    # Defaults for the user-facing parameters; a request may override them
+    INPUT_GENERATE_KWARGS = {
+        'max_new_tokens': 256,
+        'top_p': 0.95,
+        'top_k': 50,
+        'temperature': 0.8,
+    }
+
+    # These are the args we need to map the user request to before running generate()
+    # with FasterTransformer
+    FT_GENERATE_KWARGS = {
+        # Output sequence length to generate.
+        'output_len': 256,
+        # Beam width for beam search
+        'beam_width': 1,
+        # top k candidate number
+        'top_k': 50,
+        # top p probability threshold
+        'top_p': 0.95,
+        # temperature parameter
+        'temperature': 0.8,
+        # Penalty for repetitions
+        'repetition_penalty': 1.0,
+        # Presence penalty. Similar to repetition, but additive rather than multiplicative.
+        'presence_penalty': 0.0,
+        'beam_search_diversity_rate': 0.0,
+        'len_penalty': 0.0,
+        'bad_words_list': None,
+        # A minimum number of tokens to generate.
+        'min_length': 0,
+        # if True, use different random seed for sentences in a batch.
+        'random_seed': True
+    }
+
+    INPUT_KEY = 'inputs'
+    PARAMETERS_KEY = 'parameters'
+
+    def __init__(self,
+                 model_name_or_path: str,
+                 ft_lib_path: str,
+                 inference_data_type: str = 'bf16',
+                 int8_mode: int = 0,
+                 gpus: int = 1):
+        """Fastertransformer model handler for MPT foundation series.
+
+        Args:
+            model_name_or_path (str): Name of the model as on HF hub (e.g., mosaicml/mpt-7b-instruct) or local model name (e.g., mpt-7b-instruct)
+            ft_lib_path (str): Path to the libth_transformer dynamic lib file (e.g., build/lib/libth_transformer.so).
+            inference_data_type (str): Data type to use for inference (Default: bf16)
+            int8_mode (int): The level of quantization to perform. 0: No quantization. All computation in data_type,
+                1: Quantize weights to int8, all compute occurs in fp16/bf16. Not supported when data_type is fp32
+            gpus (int): Number of gpus to use for inference; selects the '<gpus>-gpu' checkpoint folder (Default: 1)
+        """
+        self.model_name_or_path = model_name_or_path
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path,
+                                                       trust_remote_code=True)
+
+        # Make sure the seed on all ranks is the same. This is important.
+        # Multi-gpu generate calls will hang without this.
+        torch.manual_seed(0)
+
+        model_path = os.path.join(LOCAL_CHECKPOINT_DIR, f'{gpus}-gpu')
+        ckpt_config_path = os.path.join(model_path, 'config.ini')
+
+        ckpt_config = configparser.ConfigParser()
+        ckpt_config.read(ckpt_config_path)  # a missing file yields an empty config
+
+        # Disable this optimization.
+        # https://github.com/NVIDIA/FasterTransformer/blob/main/docs/gpt_guide.md#advanced-features
+        shared_contexts_ratio = 0.0
+
+        if 'gpt' in ckpt_config.keys():
+            head_num = ckpt_config.getint('gpt', 'head_num')
+            size_per_head = ckpt_config.getint('gpt', 'size_per_head')
+            vocab_size = ckpt_config.getint('gpt', 'vocab_size')
+            start_id = ckpt_config.getint('gpt', 'start_id')
+            end_id = ckpt_config.getint('gpt', 'end_id')
+            layer_num = ckpt_config.getint('gpt', 'num_layer')
+            max_seq_len = ckpt_config.getint('gpt', 'max_pos_seq_len')
+            weights_data_type = ckpt_config.get('gpt', 'weight_data_type')
+            tensor_para_size = ckpt_config.getint('gpt', 'tensor_para_size')
+            pipeline_para_size = ckpt_config.getint('gpt',
+                                                    'pipeline_para_size',
+                                                    fallback=1)
+            layernorm_eps = ckpt_config.getfloat('gpt',
+                                                 'layernorm_eps',
+                                                 fallback=1e-5)
+            use_attention_linear_bias = ckpt_config.getboolean(
+                'gpt', 'use_attention_linear_bias')
+            has_positional_encoding = ckpt_config.getboolean(
+                'gpt', 'has_positional_encoding')
+        else:
+            raise RuntimeError(
+                'Unexpected config.ini for the FT checkpoint. Expected FT checkpoint to contain the `gpt` key.'
+            )
+
+        self.end_id = end_id
+
+        if not comm.is_model_parallel_initailized():
+            comm.initialize_model_parallel(tensor_para_size, pipeline_para_size)
+
+        print('Initializing FasterTransformer')
+        self.model = ParallelGPT(
+            head_num,
+            size_per_head,
+            vocab_size,
+            start_id,
+            end_id,
+            layer_num,
+            max_seq_len,
+            tensor_para_size,
+            pipeline_para_size,
+            lib_path=ft_lib_path,
+            inference_data_type=inference_data_type,
+            int8_mode=int8_mode,
+            weights_data_type=weights_data_type,
+            layernorm_eps=layernorm_eps,
+            use_attention_linear_bias=use_attention_linear_bias,
+            has_positional_encoding=has_positional_encoding,
+            shared_contexts_ratio=shared_contexts_ratio)
+        print(f'Loading FT checkpoint from {model_path}')
+        if not self.model.load(ckpt_path=model_path):
+            raise RuntimeError(
+                'Could not load model from a FasterTransformer checkpoint')
+        print('FT initialization complete')
+
+        self.device = comm.get_device()
+
+    def _parse_model_request(self, model_request: Dict) -> Tuple[str, Dict]:
+        """Returns the request's input and its generate kwargs."""
+        if self.INPUT_KEY not in model_request:
+            raise RuntimeError(
+                f'"{self.INPUT_KEY}" must be provided to generate call')
+
+        generate_input = model_request[self.INPUT_KEY]
+
+        # Start from the defaults, then apply the request's parameters (if any)
+        generate_kwargs = copy.deepcopy(self.INPUT_GENERATE_KWARGS)
+        for k, v in (model_request.get(self.PARAMETERS_KEY) or {}).items():
+            generate_kwargs[k] = v
+
+        return generate_input, generate_kwargs
+
+    def _map_input_generate_params_to_ft_params(self,
+                                                generate_kwargs: Dict) -> Dict:
+        """Translates user-facing generate kwargs into FasterTransformer args."""
+        # Use the default ft args as the base
+        ft_args = copy.deepcopy(self.FT_GENERATE_KWARGS)
+
+        # max_new_tokens is called output_len in FasterTransformer
+        ft_args['output_len'] = generate_kwargs['max_new_tokens']
+        # top_p, top_k, and temperature map 1:1
+        ft_args['top_p'] = generate_kwargs['top_p']
+        ft_args['top_k'] = generate_kwargs['top_k']
+        ft_args['temperature'] = generate_kwargs['temperature']
+
+        return ft_args
+
+    def _convert_kwargs(self, generate_inputs: List[str],
+                        generate_kwargs: Dict):
+        """Converts generate_kwargs in place into required torch types."""
+        batch_size = len(generate_inputs)
+
+        # Allow 'max_length' to be an alias for 'output_len'. Makes it less
+        # likely clients break when we swap in the FT handler.
+        if 'max_length' in generate_kwargs:
+            generate_kwargs['output_len'] = generate_kwargs['max_length']
+            del generate_kwargs['max_length']
+
+        # Integer args may be floats if the values are from a json payload.
+        generate_kwargs['output_len'] = int(generate_kwargs['output_len'])
+        generate_kwargs['top_k'] = int(generate_kwargs['top_k']) * torch.ones(
+            batch_size, dtype=torch.int32)
+        generate_kwargs['top_p'] *= torch.ones(batch_size, dtype=torch.float32)
+        generate_kwargs['temperature'] *= torch.ones(batch_size,
+                                                     dtype=torch.float32)
+        repetition_penalty = generate_kwargs['repetition_penalty']
+        generate_kwargs[
+            'repetition_penalty'] = None if repetition_penalty == 1.0 else repetition_penalty * torch.ones(
+                batch_size, dtype=torch.float32)
+        presence_penalty = generate_kwargs['presence_penalty']
+        generate_kwargs[
+            'presence_penalty'] = None if presence_penalty == 0.0 else presence_penalty * torch.ones(
+                batch_size, dtype=torch.float32)
+        generate_kwargs['beam_search_diversity_rate'] *= torch.ones(
+            batch_size, dtype=torch.float32)
+        generate_kwargs['len_penalty'] *= torch.ones(size=[batch_size],
+                                                     dtype=torch.float32)
+        generate_kwargs['min_length'] = int(
+            generate_kwargs['min_length']) * torch.ones(size=[batch_size],
+                                                        dtype=torch.int32)
+        if generate_kwargs['random_seed']:
+            generate_kwargs['random_seed'] = torch.randint(0,
+                                                           10000,
+                                                           size=[batch_size],
+                                                           dtype=torch.int64)
+
+    def _parse_model_requests(
+            self, model_requests: List[Dict]) -> Tuple[List[str], Dict]:
+        """Splits requests into a flat list of inputs and merged kwargs."""
+        generate_inputs = []
+        generate_kwargs = {}
+        for req in model_requests:
+            generate_input, generate_kwarg = self._parse_model_request(req)
+            generate_inputs += [generate_input]
+
+            # In the case of batched requests, make sure that all requests in the batch
+            # have the same generate kwargs and if not throw an error
+            for k, v in generate_kwarg.items():
+                if k in generate_kwargs and generate_kwargs[k] != v:
+                    raise RuntimeError(
+                        f'Request has conflicting values for kwarg {k}')
+                generate_kwargs[k] = v
+
+        return generate_inputs, generate_kwargs
+
+    @torch.no_grad()
+    def predict(self, model_requests: List[Dict]) -> List[str]:
+        """Runs batched generation and returns each beam's decoded text."""
+        generate_inputs, input_generate_kwargs = self._parse_model_requests(
+            model_requests)
+        # Map our input generate kwargs to the ones FasterTransformer expects
+        generate_kwargs = self._map_input_generate_params_to_ft_params(
+            input_generate_kwargs)
+        self._convert_kwargs(generate_inputs, generate_kwargs)
+
+        start_ids = [
+            torch.tensor(self.tokenizer.encode(c),
+                         dtype=torch.int32,
+                         device=self.device) for c in generate_inputs
+        ]
+        start_lengths = [len(ids) for ids in start_ids]
+        start_ids = pad_sequence(start_ids,
+                                 batch_first=True,
+                                 padding_value=self.end_id)
+        start_lengths = torch.IntTensor(start_lengths)
+        tokens_batch = self.model(start_ids, start_lengths, **generate_kwargs)
+        outputs = []
+        for tokens in tokens_batch:
+            for beam_id in range(generate_kwargs['beam_width']):
+                # Do not exclude context input from the output
+                token = tokens[beam_id]
+                # Drop every end_id token; This is the same as eos_token_id
+                token = token[token != self.end_id]
+                output = self.tokenizer.decode(token)
+                outputs.append(output)
+        return outputs
+
+    def predict_stream(self, **model_requests: Dict):
+        raise RuntimeError('Streaming is not supported with FasterTransformer!')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawTextHelpFormatter)
+
+    parser.add_argument(
+        '--ft_lib_path',
+        type=str,
+        required=True,
+        help=
+        'Path to the libth_transformer dynamic lib file (e.g., build/lib/libth_transformer.so).'
+    )
+    parser.add_argument(
+        '--name_or_dir',
+        '-i',
+        type=str,
+        help=
+        'HF hub Model name (e.g., mosaicml/mpt-7b), s3:// URL, or local dir path to load checkpoint from',
+        required=True)
+    parser.add_argument('--inference_data_type',
+                        '--data_type',
+                        type=str,
+                        choices=['fp32', 'fp16', 'bf16'],
+                        default='bf16')
+    parser.add_argument(
+        '--int8_mode',
+        type=int,
+        default=0,
+        choices=[0, 1],
+        help=
+        'The level of quantization to perform. 0: No quantization. All computation in data_type. 1: Quantize weights to int8, all compute occurs in fp16/bf16. Not supported when data_type is fp32'
+    )
+    parser.add_argument('--gpus',
+                        type=int,
+                        default=1,
+                        help='The number of gpus to use for inference.')
+
+    parser.add_argument(
+        '--force',
+        action='store_true',
+        help=
+        'Force conversion to FT even if some features may not work as expected in FT'
+    )
+
+    args = parser.parse_args()
+
+    # Only s3:// URLs are S3 locations; anything else is an HF hub name or a
+    # local folder. S3 models are loaded from the local download folder.
+    is_s3 = args.name_or_dir.startswith('s3://')
+    s3_path = args.name_or_dir if is_s3 else None
+    hf_path = None if is_s3 else args.name_or_dir
+    model_name_or_path = LOCAL_MODEL_PATH if is_s3 else args.name_or_dir
+
+    if not comm.is_model_parallel_initailized():
+        # pipeline parallelism is 1 for now
+        comm.initialize_model_parallel(tensor_para_size=args.gpus,
+                                       pipeline_para_size=1)
+
+    if comm.get_rank() == 0:  # a single rank downloads and converts
+        download_convert(s3_path=s3_path,
+                         hf_path=hf_path,
+                         gpus=args.gpus,
+                         force_conversion=args.force)
+    if dist.is_initialized():
+        dist.barrier()
+
+    model_handle = MPTFTHostedModelHandler(
+        model_name_or_path, args.ft_lib_path, args.inference_data_type,
+        args.int8_mode, args.gpus)
+    inputs = {'inputs': 'Who is the president of the USA?'}
+    out = model_handle.predict([inputs])
+    print(out[0])

From 36a57aaaa74abaaf33ac6727b066cdc5c531ed79 Mon Sep 17 00:00:00 2001
From: Ajay Saini <ajay@mosaicml.com>
Date: Wed, 21 Jun 2023 14:25:35 -0700
Subject: [PATCH 02/12] Hosted handler for MPT (#383)

---
 .../mpt/mpt_hosted_handler.py                 | 424 ++++++++++++++++++
 1 file changed, 424 insertions(+)
 create mode 100644 examples/inference-deployments/mpt/mpt_hosted_handler.py

diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py
new file mode 100644
index 000000000..8866dd7c8
--- /dev/null
+++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py
@@ -0,0 +1,424 @@
+# Copyright 2022 MosaicML Examples authors
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import configparser
+import copy
+import os
+from typing import Dict, List, Tuple, Optional
+from pathlib import Path
+from urllib.parse import urlparse
+
+import boto3
+import botocore
+import torch
+import torch.distributed as dist
+from FasterTransformer.examples.pytorch.gpt.utils.parallel_gpt import ParallelGPT  # yapf: disable # type: ignore
+from FasterTransformer.examples.pytorch.gpt.utils import comm  # yapf: disable # type: ignore
+from scripts.inference.convert_hf_mpt_to_ft import convert_mpt_to_ft  # yapf: disable # type: ignore
+from torch.nn.utils.rnn import pad_sequence
+from transformers import AutoTokenizer
+from huggingface_hub import snapshot_download
+
+LOCAL_CHECKPOINT_DIR = '/tmp/mpt'  # S3 downloads and FT checkpoints go here
+LOCAL_MODEL_PATH = os.path.join(LOCAL_CHECKPOINT_DIR, 'local_model')
+
+
+def download_convert(s3_path: Optional[str] = None,
+                     hf_path: Optional[str] = None,
+                     gpus: int = 1,
+                     force_conversion: bool = False):
+    """Download model and convert to FasterTransformer format.
+
+    Args:
+        s3_path (str): Path for model location in an s3 bucket.
+        hf_path (str): Name of the model as on HF hub (e.g., mosaicml/mpt-7b-instruct) or local folder name containing
+            the model (e.g., mpt-7b-instruct)
+        gpus (int): Number of gpus to use for inference (Default: 1)
+        force_conversion (bool): Force conversion to FT even if some features may not work as expected in FT (Default: False)
+
+    Raises:
+        RuntimeError: If neither path is given, or if no FT checkpoint exists after conversion.
+    """
+    if not s3_path and not hf_path:
+        raise RuntimeError(
+            'Either s3_path or hf_path must be provided to download_convert')
+    model_name_or_path: str = ''
+    if s3_path:
+        # s3 creds need to already be present as env vars
+        s3 = boto3.client('s3')
+        model_name_or_path = LOCAL_MODEL_PATH
+        # Download model files unless a previous run already created the folder
+        if os.path.exists(LOCAL_MODEL_PATH):
+            print(
+                f'[+] Path {LOCAL_MODEL_PATH} already exists, skipping download'
+            )
+        else:
+            Path(LOCAL_MODEL_PATH).mkdir(parents=True, exist_ok=True)
+            print(f'Downloading model from path: {s3_path}')
+            parsed_path = urlparse(s3_path)
+            # Paginate: one listing call is capped and may omit 'Contents'.
+            pages = s3.get_paginator('list_objects_v2').paginate(
+                Bucket=parsed_path.netloc, Prefix=parsed_path.path.lstrip('/'))
+            keys = (o['Key'] for p in pages for o in p.get('Contents', []))
+            for file_key in keys:
+                file_name = os.path.basename(file_key)
+                if not file_name:
+                    continue  # 'folder/' placeholder key, nothing to download
+                try:
+                    s3.download_file(Bucket=parsed_path.netloc,
+                                     Key=file_key,
+                                     Filename=os.path.join(
+                                         LOCAL_MODEL_PATH, file_name))
+                except botocore.exceptions.ClientError as e:
+                    print(
+                        f'Error downloading file with key: {file_key} with error: {e}'
+                    )
+    elif hf_path:
+        model_name_or_path = hf_path
+        if not os.path.isdir(hf_path):  # local folders are already on disk
+            print(f'Downloading HF model with name: {hf_path}')
+            snapshot_download(repo_id=hf_path)
+
+    # This is the format that the conversion script saves the converted checkpoint in
+    local_ft_model_path = os.path.join(LOCAL_CHECKPOINT_DIR, f'{gpus}-gpu')
+    ckpt_config_path = os.path.join(local_ft_model_path, 'config.ini')
+
+    # Convert model to FT format, unless an FT checkpoint already exists.
+    if not os.path.isfile(ckpt_config_path):
+        print('Converting model to FT format')
+        # Datatype of weights in the HF checkpoint
+        weight_data_type = 'fp32'
+        convert_mpt_to_ft(model_name_or_path, LOCAL_CHECKPOINT_DIR, gpus,
+                          weight_data_type, force_conversion)
+        if not os.path.isfile(ckpt_config_path):
+            raise RuntimeError('Failed to create FT checkpoint')
+    else:
+        print(f'Reusing existing FT checkpoint at {local_ft_model_path}')
+
+
+class MPTFTHostedModelHandler:
+    # Defaults for the user-facing parameters; a request may override them
+    INPUT_GENERATE_KWARGS = {
+        'max_new_tokens': 256,
+        'top_p': 0.95,
+        'top_k': 50,
+        'temperature': 0.8,
+    }
+
+    # These are the args we need to map the user request to before running generate()
+    # with FasterTransformer
+    FT_GENERATE_KWARGS = {
+        # Output sequence length to generate.
+        'output_len': 256,
+        # Beam width for beam search
+        'beam_width': 1,
+        # top k candidate number
+        'top_k': 50,
+        # top p probability threshold
+        'top_p': 0.95,
+        # temperature parameter
+        'temperature': 0.8,
+        # Penalty for repetitions
+        'repetition_penalty': 1.0,
+        # Presence penalty. Similar to repetition, but additive rather than multiplicative.
+        'presence_penalty': 0.0,
+        'beam_search_diversity_rate': 0.0,
+        'len_penalty': 0.0,
+        'bad_words_list': None,
+        # A minimum number of tokens to generate.
+        'min_length': 0,
+        # if True, use different random seed for sentences in a batch.
+        'random_seed': True
+    }
+
+    INPUT_KEY = 'inputs'  # request field holding the generation input
+    PARAMETERS_KEY = 'parameters'  # optional request field with overrides
+
+    def __init__(self,
+                 model_name_or_path: str,
+                 ft_lib_path: str,
+                 inference_data_type: str = 'bf16',
+                 int8_mode: int = 0,
+                 gpus: int = 1):
+        """Fastertransformer model handler for MPT foundation series.
+
+        Args:
+            model_name_or_path (str): Name of the model as on HF hub (e.g., mosaicml/mpt-7b-instruct) or local model name (e.g., mpt-7b-instruct)
+            ft_lib_path (str): Path to the libth_transformer dynamic lib file (e.g., build/lib/libth_transformer.so).
+            inference_data_type (str): Data type to use for inference (Default: bf16)
+            int8_mode (int): The level of quantization to perform. 0: No quantization. All computation in data_type,
+                1: Quantize weights to int8, all compute occurs in fp16/bf16. Not supported when data_type is fp32
+            gpus (int): Number of gpus to use for inference; selects the '<gpus>-gpu' checkpoint folder (Default: 1)
+        """
+        self.model_name_or_path = model_name_or_path
+
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path,
+                                                       trust_remote_code=True)
+
+        # Make sure the seed on all ranks is the same. This is important.
+        # Multi-gpu generate calls will hang without this.
+        torch.manual_seed(0)
+
+        model_path = os.path.join(LOCAL_CHECKPOINT_DIR, f'{gpus}-gpu')
+        ckpt_config_path = os.path.join(model_path, 'config.ini')
+
+        ckpt_config = configparser.ConfigParser()
+        ckpt_config.read(ckpt_config_path)  # a missing file yields an empty config
+
+        # Disable this optimization.
+        # https://github.com/NVIDIA/FasterTransformer/blob/main/docs/gpt_guide.md#advanced-features
+        shared_contexts_ratio = 0.0
+
+        if 'gpt' in ckpt_config.keys():
+            head_num = ckpt_config.getint('gpt', 'head_num')
+            size_per_head = ckpt_config.getint('gpt', 'size_per_head')
+            vocab_size = ckpt_config.getint('gpt', 'vocab_size')
+            start_id = ckpt_config.getint('gpt', 'start_id')
+            end_id = ckpt_config.getint('gpt', 'end_id')
+            layer_num = ckpt_config.getint('gpt', 'num_layer')
+            max_seq_len = ckpt_config.getint('gpt', 'max_pos_seq_len')
+            weights_data_type = ckpt_config.get('gpt', 'weight_data_type')
+            tensor_para_size = ckpt_config.getint('gpt', 'tensor_para_size')
+            pipeline_para_size = ckpt_config.getint('gpt',
+                                                    'pipeline_para_size',
+                                                    fallback=1)
+            layernorm_eps = ckpt_config.getfloat('gpt',
+                                                 'layernorm_eps',
+                                                 fallback=1e-5)
+            use_attention_linear_bias = ckpt_config.getboolean(
+                'gpt', 'use_attention_linear_bias')
+            has_positional_encoding = ckpt_config.getboolean(
+                'gpt', 'has_positional_encoding')
+        else:
+            raise RuntimeError(
+                'Unexpected config.ini for the FT checkpoint. Expected FT checkpoint to contain the `gpt` key.'
+            )
+
+        self.end_id = end_id
+
+        if not comm.is_model_parallel_initailized():
+            comm.initialize_model_parallel(tensor_para_size, pipeline_para_size)
+
+        print('Initializing FasterTransformer')
+        self.model = ParallelGPT(
+            head_num,
+            size_per_head,
+            vocab_size,
+            start_id,
+            end_id,
+            layer_num,
+            max_seq_len,
+            tensor_para_size,
+            pipeline_para_size,
+            lib_path=ft_lib_path,
+            inference_data_type=inference_data_type,
+            int8_mode=int8_mode,
+            weights_data_type=weights_data_type,
+            layernorm_eps=layernorm_eps,
+            use_attention_linear_bias=use_attention_linear_bias,
+            has_positional_encoding=has_positional_encoding,
+            shared_contexts_ratio=shared_contexts_ratio)
+        print(f'Loading FT checkpoint from {model_path}')
+        if not self.model.load(ckpt_path=model_path):
+            raise RuntimeError(
+                'Could not load model from a FasterTransformer checkpoint')
+        print('FT initialization complete')
+
+        self.device = comm.get_device()
+
+    def _parse_model_request(self, model_request: Dict) -> Tuple[str, Dict]:
+        """Returns the request's input and its generate kwargs."""
+        if self.INPUT_KEY not in model_request:
+            raise RuntimeError(
+                f'"{self.INPUT_KEY}" must be provided to generate call')
+
+        generate_input = model_request[self.INPUT_KEY]
+
+        # Start from the defaults, then apply the request's parameters (if any)
+        generate_kwargs = copy.deepcopy(self.INPUT_GENERATE_KWARGS)
+        for k, v in (model_request.get(self.PARAMETERS_KEY) or {}).items():
+            generate_kwargs[k] = v
+
+        return generate_input, generate_kwargs
+
+    def _map_input_generate_params_to_ft_params(self,
+                                                generate_kwargs: Dict) -> Dict:
+        """Translates user-facing generate kwargs into FasterTransformer args."""
+        # Use the default ft args as the base
+        ft_args = copy.deepcopy(self.FT_GENERATE_KWARGS)
+
+        # max_new_tokens is called output_len in FasterTransformer
+        ft_args['output_len'] = generate_kwargs['max_new_tokens']
+        # top_p, top_k, and temperature map 1:1
+        ft_args['top_p'] = generate_kwargs['top_p']
+        ft_args['top_k'] = generate_kwargs['top_k']
+        ft_args['temperature'] = generate_kwargs['temperature']
+
+        return ft_args
+
+    def _convert_kwargs(self, generate_inputs: List[str],
+                        generate_kwargs: Dict):
+        """Rewrites generate_kwargs in place with the required torch types.
+
+        Scalar sampling settings become 1-D tensors holding one copy of the
+        value per entry of generate_inputs; nothing is returned.
+        """
+        kw = generate_kwargs
+        n = len(generate_inputs)
+
+        # 'max_length' is accepted as an alias of 'output_len' so that clients
+        # are less likely to break when the FT handler is swapped in.
+        if 'max_length' in kw:
+            kw['output_len'] = kw.pop('max_length')
+
+        # JSON payloads may deliver integer args as floats, hence the int() casts.
+        kw['output_len'] = int(kw['output_len'])
+        kw['top_k'] = int(kw['top_k']) * torch.ones(n, dtype=torch.int32)
+        kw['top_p'] *= torch.ones(n, dtype=torch.float32)
+        kw['temperature'] *= torch.ones(n, dtype=torch.float32)
+
+        # A neutral penalty value is replaced by None rather than a tensor.
+        rep = kw['repetition_penalty']
+        kw['repetition_penalty'] = (None if rep == 1.0 else rep *
+                                    torch.ones(n, dtype=torch.float32))
+        pres = kw['presence_penalty']
+        kw['presence_penalty'] = (None if pres == 0.0 else pres *
+                                  torch.ones(n, dtype=torch.float32))
+
+        kw['beam_search_diversity_rate'] *= torch.ones(n, dtype=torch.float32)
+        kw['len_penalty'] *= torch.ones(n, dtype=torch.float32)
+        kw['min_length'] = int(kw['min_length']) * torch.ones(
+            n, dtype=torch.int32)
+
+        if kw['random_seed']:
+            # One seed per sample, drawn from [0, 10000).
+            kw['random_seed'] = torch.randint(
+                0, 10000, size=[n], dtype=torch.int64)
+
+    def _parse_model_requests(
+            self, model_requests: List[Dict]) -> Tuple[List[str], Dict]:
+        """Splits requests into a flat list of inputs and merged kwargs."""
+        generate_inputs = []
+        generate_kwargs = {}
+        for req in model_requests:
+            generate_input, generate_kwarg = self._parse_model_request(req)
+            generate_inputs += [generate_input]
+
+            # In the case of batched requests, make sure that all requests in the batch
+            # have the same generate kwargs and if not throw an error
+            for k, v in generate_kwarg.items():
+                if k in generate_kwargs and generate_kwargs[k] != v:
+                    raise RuntimeError(
+                        f'Request has conflicting values for kwarg {k}')
+                generate_kwargs[k] = v
+
+        return generate_inputs, generate_kwargs
+
+    @torch.no_grad()
+    def predict(self, model_requests: List[Dict]) -> List[str]:
+        generate_inputs, input_generate_kwargs = self._parse_model_requests(
+            model_requests)
+        # Map our input generate kwargs to the ones FasterTransformer expects
+        generate_kwargs = self._map_input_generate_params_to_ft_params(
+            input_generate_kwargs)
+        self._convert_kwargs(generate_inputs, generate_kwargs)
+
+        start_ids = [
+            torch.tensor(self.tokenizer.encode(c),
+                         dtype=torch.int32,
+                         device=self.device) for c in generate_inputs
+        ]
+        start_lengths = [len(ids) for ids in start_ids]
+        start_ids = pad_sequence(start_ids,
+                                 batch_first=True,
+                                 padding_value=self.end_id)
+        start_lengths = torch.IntTensor(start_lengths)
+        tokens_batch = self.model(start_ids, start_lengths, **generate_kwargs)
+        outputs = []
+        for tokens in tokens_batch:
+            for beam_id in range(generate_kwargs['beam_width']):
+                # Do not exclude context input from the output
+                # token = tokens[beam_id][start_lengths[i]:]
+                token = tokens[beam_id]
+                # stop at end_id; This is the same as eos_token_id
+                token = token[token != self.end_id]
+                output = self.tokenizer.decode(token)
+                outputs.append(output)
+        return outputs
+
+    def predict_stream(self, **model_requests: Dict):
+        raise RuntimeError('Streaming is not supported with FasterTransformer!')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawTextHelpFormatter)
+
+    parser.add_argument(
+        '--ft_lib_path',
+        type=str,
+        required=True,
+        help=
+        'Path to the libth_transformer dynamic lib file (e.g., build/lib/libth_transformer.so).'
+    )
+    parser.add_argument(
+        '--name_or_dir',
+        '-i',
+        type=str,
+        help=
+        'HF hub Model name (e.g., mosaicml/mpt-7b) or local dir path to load checkpoint from',
+        required=True)
+    parser.add_argument('--inference_data_type',
+                        '--data_type',
+                        type=str,
+                        choices=['fp32', 'fp16', 'bf16'],
+                        default='bf16')
+    parser.add_argument(
+        '--int8_mode',
+        type=int,
+        default=0,
+        choices=[0, 1],
+        help=
+        'The level of quantization to perform. 0: No quantization. All computation in data_type. 1: Quantize weights to int8, all compute occurs in fp16/bf16. Not supported when data_type is fp32'
+    )
+    parser.add_argument('--gpus',
+                        type=int,
+                        default=1,
+                        help='The number of gpus to use for inference.')
+    parser.add_argument(
+        '--force',
+        action='store_true',
+        help=
+        'Force conversion to FT even if some features may not work as expected in FT'
+    )
+
+    args = parser.parse_args()
+
+    # Only an explicit s3:// URI is an S3 location; a bare substring test
+    # would misroute HF names or local paths that merely contain "s3".
+    if args.name_or_dir.startswith('s3://'):
+        s3_path, hf_path = args.name_or_dir, None
+    else:
+        s3_path, hf_path = None, args.name_or_dir
+
+    if not comm.is_model_parallel_initailized():
+        # pipeline parallelism is 1 for now
+        comm.initialize_model_parallel(tensor_para_size=args.gpus,
+                                       pipeline_para_size=1)
+
+    # Rank 0 alone downloads/converts; other ranks wait at the barrier below.
+    if comm.get_rank() == 0:
+        download_convert(s3_path=s3_path,
+                         hf_path=hf_path,
+                         gpus=args.gpus,
+                         force_conversion=args.force)
+    if dist.is_initialized():
+        dist.barrier()
+
+    model_handle = MPTFTHostedModelHandler(args.name_or_dir, args.ft_lib_path,
+                                           args.inference_data_type,
+                                           args.int8_mode, args.gpus)
+    inputs = {'input': 'Who is the president of the USA?'}
+    out = model_handle.predict([inputs])
+    print(out[0])

From 1d8baafa1c9ce87ec3bff9e355b59bf7dd5f5827 Mon Sep 17 00:00:00 2001
From: Ajay Saini <ajay@mosaicml.com>
Date: Wed, 21 Jun 2023 15:12:56 -0700
Subject: [PATCH 03/12] Use max_tokens

---
 examples/inference-deployments/mpt/mpt_hosted_handler.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py
index 8866dd7c8..e78679853 100644
--- a/examples/inference-deployments/mpt/mpt_hosted_handler.py
+++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py
@@ -100,7 +100,7 @@ def download_convert(s3_path: Optional[str] = None,
 class MPTFTHostedModelHandler:
     # This is what the user request will contain
     INPUT_GENERATE_KWARGS = {
-        'max_new_tokens': 256,
+        'max_tokens': 256,
         'top_p': 0.95,
         'top_k': 50,
         'temperature': 0.8,
@@ -247,8 +247,8 @@ def _map_input_generate_params_to_ft_params(self,
         # Use the default ft args as the base
         ft_args = copy.deepcopy(self.FT_GENERATE_ARGS)
 
-        # max_new_tokens is called output_len in FasterTransformer
-        ft_args['output_len'] = generate_kwargs['max_new_tokens']
+        # max_tokens is called output_len in FasterTransformer
+        ft_args['output_len'] = generate_kwargs['max_tokens']
 
         # top_p, top_k, and temperature map 1:1
         ft_args['top_p'] = generate_kwargs['top_p']

From 8a16f8d6f4fa95ba23969fa8dffd88d7ce393455 Mon Sep 17 00:00:00 2001
From: Ajay Saini <ajay@mosaicml.com>
Date: Wed, 21 Jun 2023 16:17:51 -0700
Subject: [PATCH 04/12] Naming

---
 examples/inference-deployments/mpt/mpt_hosted_handler.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py
index e78679853..61bd6ef40 100644
--- a/examples/inference-deployments/mpt/mpt_hosted_handler.py
+++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py
@@ -100,7 +100,7 @@ def download_convert(s3_path: Optional[str] = None,
 class MPTFTHostedModelHandler:
     # This is what the user request will contain
     INPUT_GENERATE_KWARGS = {
-        'max_tokens': 256,
+        'max_new_tokens': 256,
         'top_p': 0.95,
         'top_k': 50,
         'temperature': 0.8,
@@ -132,7 +132,7 @@ class MPTFTHostedModelHandler:
         'random_seed': True
     }
 
-    INPUT_KEY = 'inputs'
+    INPUT_KEY = 'input'
     PARAMETERS_KEY = 'parameters'
 
     def __init__(self,
@@ -247,8 +247,8 @@ def _map_input_generate_params_to_ft_params(self,
         # Use the default ft args as the base
         ft_args = copy.deepcopy(self.FT_GENERATE_ARGS)
 
-        # max_tokens is called output_len in FasterTransformer
-        ft_args['output_len'] = generate_kwargs['max_tokens']
+        # max_new_tokens is called output_len in FasterTransformer
+        ft_args['output_len'] = generate_kwargs['max_new_tokens']
 
         # top_p, top_k, and temperature map 1:1
         ft_args['top_p'] = generate_kwargs['top_p']

From 425b1e5faff5d9c46f65a182b79b48b745998cc7 Mon Sep 17 00:00:00 2001
From: Ajay Saini <ajay@mosaicml.com>
Date: Wed, 21 Jun 2023 16:19:42 -0700
Subject: [PATCH 05/12] Input naming  (#385)

---
 examples/inference-deployments/mpt/mpt_hosted_handler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py
index 8866dd7c8..61bd6ef40 100644
--- a/examples/inference-deployments/mpt/mpt_hosted_handler.py
+++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py
@@ -132,7 +132,7 @@ class MPTFTHostedModelHandler:
         'random_seed': True
     }
 
-    INPUT_KEY = 'inputs'
+    INPUT_KEY = 'input'
     PARAMETERS_KEY = 'parameters'
 
     def __init__(self,

From 8d3fcebf9e1d54054114089e8d832bbb5b6c7507 Mon Sep 17 00:00:00 2001
From: Ajay Saini <ajay@mosaicml.com>
Date: Wed, 21 Jun 2023 16:40:17 -0700
Subject: [PATCH 06/12] Args naming

---
 examples/inference-deployments/mpt/mpt_hosted_handler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py
index 61bd6ef40..88809dd72 100644
--- a/examples/inference-deployments/mpt/mpt_hosted_handler.py
+++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py
@@ -245,7 +245,7 @@ def _parse_model_request(self, model_request: Dict) -> Tuple[str, Dict]:
     def _map_input_generate_params_to_ft_params(self,
                                                 generate_kwargs: Dict) -> Dict:
         # Use the default ft args as the base
-        ft_args = copy.deepcopy(self.FT_GENERATE_ARGS)
+        ft_args = copy.deepcopy(self.FT_GENERATE_KWARGS)
 
         # max_new_tokens is called output_len in FasterTransformer
         ft_args['output_len'] = generate_kwargs['max_new_tokens']

From 70d91518322d01e4c316eedb72848fe2461cef71 Mon Sep 17 00:00:00 2001
From: Ajay Saini <ajay@mosaicml.com>
Date: Wed, 21 Jun 2023 16:41:11 -0700
Subject: [PATCH 07/12] Fixed FT args naming  (#387)

---
 examples/inference-deployments/mpt/mpt_hosted_handler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py
index 61bd6ef40..88809dd72 100644
--- a/examples/inference-deployments/mpt/mpt_hosted_handler.py
+++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py
@@ -245,7 +245,7 @@ def _parse_model_request(self, model_request: Dict) -> Tuple[str, Dict]:
     def _map_input_generate_params_to_ft_params(self,
                                                 generate_kwargs: Dict) -> Dict:
         # Use the default ft args as the base
-        ft_args = copy.deepcopy(self.FT_GENERATE_ARGS)
+        ft_args = copy.deepcopy(self.FT_GENERATE_KWARGS)
 
         # max_new_tokens is called output_len in FasterTransformer
         ft_args['output_len'] = generate_kwargs['max_new_tokens']

From eebef2c5745a9b3e4ed14394f0af90ca6f9aaa3a Mon Sep 17 00:00:00 2001
From: Ajay Saini <ajay@mosaicml.com>
Date: Wed, 21 Jun 2023 19:34:54 -0700
Subject: [PATCH 08/12] Exclude input from output (#388)

* Enable CodeQL for pull requests (#374)

This reverts commit 1a0492376cb1fc0b83e6e5db1e6edbb06c60c175.

* Update

---------

Co-authored-by: bandish-shah <86627118+bandish-shah@users.noreply.github.com>
---
 .github/workflows/codeql-analysis.yml                 |  3 +++
 .../inference-deployments/mpt/mpt_7b_ft_handler.py    | 11 +++++++----
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index fbeef28d2..f3ecf9a1f 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -14,6 +14,9 @@ name: "CodeQL"
 on:
   push:
     branches: [ main ]
+  pull_request:
+    # The branches below must be a subset of the branches above
+    branches: [ main ]
   schedule:
     - cron: '0 9 * * 1'  # Every Monday at 09:00 (9:00 AM)
 
diff --git a/examples/inference-deployments/mpt/mpt_7b_ft_handler.py b/examples/inference-deployments/mpt/mpt_7b_ft_handler.py
index 0f2621c5b..36fb88a50 100644
--- a/examples/inference-deployments/mpt/mpt_7b_ft_handler.py
+++ b/examples/inference-deployments/mpt/mpt_7b_ft_handler.py
@@ -307,11 +307,14 @@ def predict(self, model_requests: List[Dict]) -> List[str]:
         start_lengths = torch.IntTensor(start_lengths)
         tokens_batch = self.model(start_ids, start_lengths, **generate_kwargs)
         outputs = []
-        for tokens in tokens_batch:
+        for i, tokens in enumerate(tokens_batch):
             for beam_id in range(generate_kwargs['beam_width']):
-                # Do not exclude context input from the output
-                # token = tokens[beam_id][start_lengths[i]:]
-                token = tokens[beam_id]
+                # Exclude context input from the output
+                token = tokens[beam_id][start_lengths[i]:]
+
+                # Do this to include context input in the output
+                # token = tokens[beam_id]
+
                 # stop at end_id; This is the same as eos_token_id
                 token = token[token != self.end_id]
                 output = self.tokenizer.decode(token)

From 6af4013893c7f2d4d1d090f493d5becae3d1a7f9 Mon Sep 17 00:00:00 2001
From: Margaret Qian <ymqian@gmail.com>
Date: Wed, 21 Jun 2023 19:47:25 -0700
Subject: [PATCH 09/12] Add skip_special_tokens (#386)

---
 examples/inference-deployments/mpt/mpt_hosted_handler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py
index 88809dd72..fcb3144fe 100644
--- a/examples/inference-deployments/mpt/mpt_hosted_handler.py
+++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py
@@ -343,7 +343,7 @@ def predict(self, model_requests: List[Dict]) -> List[str]:
                 token = tokens[beam_id]
                 # stop at end_id; This is the same as eos_token_id
                 token = token[token != self.end_id]
-                output = self.tokenizer.decode(token)
+                output = self.tokenizer.decode(token, skip_special_tokens=True)
                 outputs.append(output)
         return outputs
 

From dceba1ad8c05582bc9e4853ec69623b558b26aed Mon Sep 17 00:00:00 2001
From: Ajay Saini <ajay@mosaicml.com>
Date: Wed, 21 Jun 2023 19:55:32 -0700
Subject: [PATCH 10/12] Exclude input tokens

---
 .../inference-deployments/mpt/mpt_hosted_handler.py   | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py
index fcb3144fe..ef2181859 100644
--- a/examples/inference-deployments/mpt/mpt_hosted_handler.py
+++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py
@@ -336,11 +336,14 @@ def predict(self, model_requests: List[Dict]) -> List[str]:
         start_lengths = torch.IntTensor(start_lengths)
         tokens_batch = self.model(start_ids, start_lengths, **generate_kwargs)
         outputs = []
-        for tokens in tokens_batch:
+        for i, tokens in enumerate(tokens_batch):
             for beam_id in range(generate_kwargs['beam_width']):
-                # Do not exclude context input from the output
-                # token = tokens[beam_id][start_lengths[i]:]
-                token = tokens[beam_id]
+                # Exclude context input from the output
+                token = tokens[beam_id][start_lengths[i]:]
+
+                # Do this to exclude context input from the output
+                # token = tokens[beam_id]
+
                 # stop at end_id; This is the same as eos_token_id
                 token = token[token != self.end_id]
                 output = self.tokenizer.decode(token, skip_special_tokens=True)

From c98b5ce713645d96c2663a53ff879ace66312dbc Mon Sep 17 00:00:00 2001
From: Ajay Saini <ajay@mosaicml.com>
Date: Wed, 21 Jun 2023 19:58:14 -0700
Subject: [PATCH 11/12] Exclude input tokens in output (#389)

---
 .../inference-deployments/mpt/mpt_hosted_handler.py   | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py
index fcb3144fe..ef2181859 100644
--- a/examples/inference-deployments/mpt/mpt_hosted_handler.py
+++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py
@@ -336,11 +336,14 @@ def predict(self, model_requests: List[Dict]) -> List[str]:
         start_lengths = torch.IntTensor(start_lengths)
         tokens_batch = self.model(start_ids, start_lengths, **generate_kwargs)
         outputs = []
-        for tokens in tokens_batch:
+        for i, tokens in enumerate(tokens_batch):
             for beam_id in range(generate_kwargs['beam_width']):
-                # Do not exclude context input from the output
-                # token = tokens[beam_id][start_lengths[i]:]
-                token = tokens[beam_id]
+                # Exclude context input from the output
+                token = tokens[beam_id][start_lengths[i]:]
+
+                # Do this to exclude context input from the output
+                # token = tokens[beam_id]
+
                 # stop at end_id; This is the same as eos_token_id
                 token = token[token != self.end_id]
                 output = self.tokenizer.decode(token, skip_special_tokens=True)

From 101552d5ba9a2f622c17e1073788b2b0200d9e38 Mon Sep 17 00:00:00 2001
From: Ajay Saini <ajay@mosaicml.com>
Date: Wed, 21 Jun 2023 20:28:07 -0700
Subject: [PATCH 12/12] Updated

---
 .../inference-deployments/mpt/mpt_hosted_handler.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py
index ef2181859..a9602394e 100644
--- a/examples/inference-deployments/mpt/mpt_hosted_handler.py
+++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py
@@ -140,7 +140,8 @@ def __init__(self,
                  ft_lib_path: str,
                  inference_data_type: str = 'bf16',
                  int8_mode: int = 0,
-                 gpus: int = 1):
+                 gpus: int = 1,
+                 exclude_input_from_output: bool = False):
         """Fastertransformer model handler for MPT foundation series.
 
         Args:
@@ -150,6 +151,7 @@ def __init__(self,
             int8_mode (int): The level of quantization to perform. 0: No quantization. All computation in data_type,
                 1: Quantize weights to int8, all compute occurs in fp16/bf16. Not supported when data_type is fp32
             gpus (int): Number of gpus to use for inference (Default: 1)
+            exclude_input_from_output (bool): True to exclude input from the model output, false otherwise.
         """
         self.model_name_or_path = model_name_or_path
 
@@ -227,6 +229,8 @@ def __init__(self,
 
         self.device = comm.get_device()
 
+        self.exclude_input_from_output = exclude_input_from_output
+
     def _parse_model_request(self, model_request: Dict) -> Tuple[str, Dict]:
         if self.INPUT_KEY not in model_request:
             raise RuntimeError(
@@ -338,11 +342,10 @@ def predict(self, model_requests: List[Dict]) -> List[str]:
         outputs = []
         for i, tokens in enumerate(tokens_batch):
             for beam_id in range(generate_kwargs['beam_width']):
+                token = tokens[beam_id]
                 # Exclude context input from the output
-                token = tokens[beam_id][start_lengths[i]:]
-
-                # Do this to exclude context input from the output
-                # token = tokens[beam_id]
+                if self.exclude_input_from_output:
+                    token = token[start_lengths[i]:]
 
                 # stop at end_id; This is the same as eos_token_id
                 token = token[token != self.end_id]