From 253a252e96646c0785eb8d156ec4bd3085f4dcb8 Mon Sep 17 00:00:00 2001 From: Ajay Saini Date: Wed, 21 Jun 2023 13:22:49 -0700 Subject: [PATCH 01/12] Hosted handler for MPT --- .../mpt/mpt_hosted_handler.py | 424 ++++++++++++++++++ 1 file changed, 424 insertions(+) create mode 100644 examples/inference-deployments/mpt/mpt_hosted_handler.py diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py new file mode 100644 index 000000000..8866dd7c8 --- /dev/null +++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py @@ -0,0 +1,424 @@ +# Copyright 2022 MosaicML Examples authors +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import configparser +import copy +import os +from typing import Dict, List, Tuple, Optional +from pathlib import Path +from urllib.parse import urlparse + +import boto3 +import botocore +import torch +import torch.distributed as dist +from FasterTransformer.examples.pytorch.gpt.utils.parallel_gpt import ParallelGPT # yapf: disable # type: ignore +from FasterTransformer.examples.pytorch.gpt.utils import comm # yapf: disable # type: ignore +from scripts.inference.convert_hf_mpt_to_ft import convert_mpt_to_ft # yapf: disable # type: ignore +from torch.nn.utils.rnn import pad_sequence +from transformers import AutoTokenizer +from huggingface_hub import snapshot_download + +LOCAL_CHECKPOINT_DIR = '/tmp/mpt' +LOCAL_MODEL_PATH = os.path.join(LOCAL_CHECKPOINT_DIR, 'local_model') + + +def download_convert(s3_path: Optional[str] = None, + hf_path: Optional[str] = None, + gpus: int = 1, + force_conversion: bool = False): + """Download model and convert to FasterTransformer format. + + Args: + s3_path (str): Path for model location in an s3 bucket. 
+ hf_path (str): Name of the model as on HF hub (e.g., mosaicml/mpt-7b-instruct) or local folder name containing + the model (e.g., mpt-7b-instruct) + gpus (int): Number of gpus to use for inference (Default: 1) + force_conversion (bool): Force conversion to FT even if some features may not work as expected in FT (Default: False) + """ + if not s3_path and not hf_path: + raise RuntimeError( + 'Either s3_path or hf_path must be provided to download_convert') + model_name_or_path: str = '' + if s3_path: + # s3 creds need to already be present as env vars + s3 = boto3.client('s3') + model_name_or_path = LOCAL_MODEL_PATH + + # Download model files + if os.path.exists(LOCAL_MODEL_PATH): + print( + f'[+] Path {LOCAL_MODEL_PATH} already exists, skipping download' + ) + else: + Path(LOCAL_MODEL_PATH).mkdir(parents=True, exist_ok=True) + + print(f'Downloading model from path: {s3_path}') + + parsed_path = urlparse(s3_path) + + objs = s3.list_objects_v2( + Bucket=parsed_path.netloc, + Prefix=parsed_path.path.lstrip('/'), + ) + for obj in objs['Contents']: + file_key = obj['Key'] + try: + file_name = os.path.basename(file_key) + s3.download_file(Bucket=parsed_path.netloc, + Key=file_key, + Filename=os.path.join( + LOCAL_MODEL_PATH, file_name)) + except botocore.exceptions.ClientError as e: + print( + f'Error downloading file with key: {file_key} with error: {e}' + ) + elif hf_path: + print(f'Downloading HF model with name: {hf_path}') + model_name_or_path = hf_path + snapshot_download(repo_id=hf_path) + + # This is the format the the conversion script saves the converted checkpoint in + local_ft_model_path = os.path.join(LOCAL_CHECKPOINT_DIR, f'{gpus}-gpu') + ckpt_config_path = os.path.join(local_ft_model_path, 'config.ini') + + # Convert model to FT format + # If FT checkpoint doesn't exist, create it. 
+ if not os.path.isfile(ckpt_config_path): + print('Converting model to FT format') + # Datatype of weights in the HF checkpoint + weight_data_type = 'fp32' + convert_mpt_to_ft(model_name_or_path, LOCAL_CHECKPOINT_DIR, gpus, + weight_data_type, force_conversion) + if not os.path.isfile(ckpt_config_path): + raise RuntimeError('Failed to create FT checkpoint') + else: + print(f'Reusing existing FT checkpoint at {local_ft_model_path}') + + +class MPTFTHostedModelHandler: + # This is what the user request will contain + INPUT_GENERATE_KWARGS = { + 'max_new_tokens': 256, + 'top_p': 0.95, + 'top_k': 50, + 'temperature': 0.8, + } + + # These are the args we need to map the user request to before running generate() + # with FasterTransformer + FT_GENERATE_KWARGS = { + # Output sequence length to generate. + 'output_len': 256, + # Beam width for beam search + 'beam_width': 1, + # top k candidate number + 'top_k': 50, + # top p probability threshold + 'top_p': 0.95, + # temperature parameter + 'temperature': 0.8, + # Penalty for repetitions + 'repetition_penalty': 1.0, + # Presence penalty. Similar to repetition, but additive rather than multiplicative. + 'presence_penalty': 0.0, + 'beam_search_diversity_rate': 0.0, + 'len_penalty': 0.0, + 'bad_words_list': None, + # A minimum number of tokens to generate. + 'min_length': 0, + # if True, use different random seed for sentences in a batch. + 'random_seed': True + } + + INPUT_KEY = 'inputs' + PARAMETERS_KEY = 'parameters' + + def __init__(self, + model_name_or_path: str, + ft_lib_path: str, + inference_data_type: str = 'bf16', + int8_mode: int = 0, + gpus: int = 1): + """Fastertransformer model handler for MPT foundation series. + + Args: + model_name_or_path (str): Name of the model as on HF hub (e.g., mosaicml/mpt-7b-instruct) or local model name (e.g., mpt-7b-instruct) + ft_lib_path (str): Path to the libth_transformer dynamic lib file(.e.g., build/lib/libth_transformer.so). 
+ inference_data_type (str): Data type to use for inference (Default: bf16) + int8_mode (int): The level of quantization to perform. 0: No quantization. All computation in data_type, + 1: Quantize weights to int8, all compute occurs in fp16/bf16. Not supported when data_type is fp32 + gpus (int): Number of gpus to use for inference (Default: 1) + """ + self.model_name_or_path = model_name_or_path + + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, + trust_remote_code=True) + + # Make sure the seed on all ranks is the same. This is important. + # Multi-gpu generate calls will hang without this. + torch.manual_seed(0) + + model_path = os.path.join(LOCAL_CHECKPOINT_DIR, f'{gpus}-gpu') + ckpt_config_path = os.path.join(model_path, 'config.ini') + + ckpt_config = configparser.ConfigParser() + ckpt_config.read(ckpt_config_path) + + # Disable this optimization. + # https://github.com/NVIDIA/FasterTransformer/blob/main/docs/gpt_guide.md#advanced-features + shared_contexts_ratio = 0.0 + + if 'gpt' in ckpt_config.keys(): + head_num = ckpt_config.getint('gpt', 'head_num') + size_per_head = ckpt_config.getint('gpt', 'size_per_head') + vocab_size = ckpt_config.getint('gpt', 'vocab_size') + start_id = ckpt_config.getint('gpt', 'start_id') + end_id = ckpt_config.getint('gpt', 'end_id') + layer_num = ckpt_config.getint('gpt', 'num_layer') + max_seq_len = ckpt_config.getint('gpt', 'max_pos_seq_len') + weights_data_type = ckpt_config.get('gpt', 'weight_data_type') + tensor_para_size = ckpt_config.getint('gpt', 'tensor_para_size') + pipeline_para_size = ckpt_config.getint('gpt', + 'pipeline_para_size', + fallback=1) + layernorm_eps = ckpt_config.getfloat('gpt', + 'layernorm_eps', + fallback=1e-5) + use_attention_linear_bias = ckpt_config.getboolean( + 'gpt', 'use_attention_linear_bias') + has_positional_encoding = ckpt_config.getboolean( + 'gpt', 'has_positional_encoding') + else: + raise RuntimeError( + 'Unexpected config.ini for the FT checkpoint. 
Expected FT checkpoint to contain the `gpt` key.' + ) + + self.end_id = end_id + + if not comm.is_model_parallel_initailized(): + comm.initialize_model_parallel(tensor_para_size, pipeline_para_size) + + print('Initializing FasterTransformer') + self.model = ParallelGPT( + head_num, + size_per_head, + vocab_size, + start_id, + end_id, + layer_num, + max_seq_len, + tensor_para_size, + pipeline_para_size, + lib_path=ft_lib_path, + inference_data_type=inference_data_type, + int8_mode=int8_mode, + weights_data_type=weights_data_type, + layernorm_eps=layernorm_eps, + use_attention_linear_bias=use_attention_linear_bias, + has_positional_encoding=has_positional_encoding, + shared_contexts_ratio=shared_contexts_ratio) + print(f'Loading FT checkpoint from {model_path}') + if not self.model.load(ckpt_path=model_path): + raise RuntimeError( + 'Could not load model from a FasterTransformer checkpoint') + print('FT initialization complete') + + self.device = comm.get_device() + + def _parse_model_request(self, model_request: Dict) -> Tuple[str, Dict]: + if self.INPUT_KEY not in model_request: + raise RuntimeError( + f'"{self.INPUT_KEY}" must be provided to generate call') + + generate_input = model_request[self.INPUT_KEY] + + # Set default generate kwargs + generate_kwargs = copy.deepcopy(self.INPUT_GENERATE_KWARGS) + # If request contains any additional kwargs, add them to generate_kwargs + for k, v in model_request.get(self.PARAMETERS_KEY, {}).items(): + generate_kwargs[k] = v + + return generate_input, generate_kwargs + + def _map_input_generate_params_to_ft_params(self, + generate_kwargs: Dict) -> Dict: + # Use the default ft args as the base + ft_args = copy.deepcopy(self.FT_GENERATE_ARGS) + + # max_new_tokens is called output_len in FasterTransformer + ft_args['output_len'] = generate_kwargs['max_new_tokens'] + + # top_p, top_k, and temperature map 1:1 + ft_args['top_p'] = generate_kwargs['top_p'] + ft_args['top_k'] = generate_kwargs['top_k'] + ft_args['temperature'] = 
generate_kwargs['temperature'] + + return ft_args + + def _convert_kwargs(self, generate_inputs: List[str], + generate_kwargs: Dict): + """Converts generate_kwargs into required torch types.""" + batch_size = len(generate_inputs) + + # Allow 'max_length' to be an alias for 'output_len'. Makes it less + # likely clients break when we swap in the FT handler. + if 'max_length' in generate_kwargs: + generate_kwargs['output_len'] = generate_kwargs['max_length'] + del generate_kwargs['max_length'] + + # Integer args may be floats if the values are from a json payload. + generate_kwargs['output_len'] = int(generate_kwargs['output_len']) + generate_kwargs['top_k'] = int(generate_kwargs['top_k']) * torch.ones( + batch_size, dtype=torch.int32) + generate_kwargs['top_p'] *= torch.ones(batch_size, dtype=torch.float32) + generate_kwargs['temperature'] *= torch.ones(batch_size, + dtype=torch.float32) + repetition_penalty = generate_kwargs['repetition_penalty'] + generate_kwargs[ + 'repetition_penalty'] = None if repetition_penalty == 1.0 else repetition_penalty * torch.ones( + batch_size, dtype=torch.float32) + presence_penalty = generate_kwargs['presence_penalty'] + generate_kwargs[ + 'presence_penalty'] = None if presence_penalty == 0.0 else presence_penalty * torch.ones( + batch_size, dtype=torch.float32) + generate_kwargs['beam_search_diversity_rate'] *= torch.ones( + batch_size, dtype=torch.float32) + generate_kwargs['len_penalty'] *= torch.ones(size=[batch_size], + dtype=torch.float32) + generate_kwargs['min_length'] = int( + generate_kwargs['min_length']) * torch.ones(size=[batch_size], + dtype=torch.int32) + if generate_kwargs['random_seed']: + generate_kwargs['random_seed'] = torch.randint(0, + 10000, + size=[batch_size], + dtype=torch.int64) + + def _parse_model_requests( + self, model_requests: List[Dict]) -> Tuple[List[str], Dict]: + """Splits requests into a flat list of inputs and merged kwargs.""" + generate_inputs = [] + generate_kwargs = {} + for req in 
model_requests: + generate_input, generate_kwarg = self._parse_model_request(req) + generate_inputs += [generate_input] + + # In the case of batched requests, make sure that all requests in the batch + # have the same generate kwargs and if not throw an error + for k, v in generate_kwarg.items(): + if k in generate_kwargs and generate_kwargs[k] != v: + raise RuntimeError( + f'Request has conflicting values for kwarg {k}') + generate_kwargs[k] = v + + return generate_inputs, generate_kwargs + + @torch.no_grad() + def predict(self, model_requests: List[Dict]) -> List[str]: + generate_inputs, input_generate_kwargs = self._parse_model_requests( + model_requests) + # Map our input generate kwargs to the ones FasterTransformer expects + generate_kwargs = self._map_input_generate_params_to_ft_params( + input_generate_kwargs) + self._convert_kwargs(generate_inputs, generate_kwargs) + + start_ids = [ + torch.tensor(self.tokenizer.encode(c), + dtype=torch.int32, + device=self.device) for c in generate_inputs + ] + start_lengths = [len(ids) for ids in start_ids] + start_ids = pad_sequence(start_ids, + batch_first=True, + padding_value=self.end_id) + start_lengths = torch.IntTensor(start_lengths) + tokens_batch = self.model(start_ids, start_lengths, **generate_kwargs) + outputs = [] + for tokens in tokens_batch: + for beam_id in range(generate_kwargs['beam_width']): + # Do not exclude context input from the output + # token = tokens[beam_id][start_lengths[i]:] + token = tokens[beam_id] + # stop at end_id; This is the same as eos_token_id + token = token[token != self.end_id] + output = self.tokenizer.decode(token) + outputs.append(output) + return outputs + + def predict_stream(self, **model_requests: Dict): + raise RuntimeError('Streaming is not supported with FasterTransformer!') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument( + '--ft_lib_path', + type=str, + required=True, + help= 
+ 'Path to the libth_transformer dynamic lib file(e.g., build/lib/libth_transformer.so.' + ) + parser.add_argument( + '--name_or_dir', + '-i', + type=str, + help= + 'HF hub Model name (e.g., mosaicml/mpt-7b) or local dir path to load checkpoint from', + required=True) + parser.add_argument('--inference_data_type', + '--data_type', + type=str, + choices=['fp32', 'fp16', 'bf16'], + default='bf16') + parser.add_argument( + '--int8_mode', + type=int, + default=0, + choices=[0, 1], + help= + 'The level of quantization to perform. 0: No quantization. All computation in data_type. 1: Quantize weights to int8, all compute occurs in fp16/bf16. Not supported when data_type is fp32' + ) + parser.add_argument('--gpus', + type=int, + default=1, + help='The number of gpus to use for inference.') + + parser.add_argument( + '--force', + action='store_true', + help= + 'Force conversion to FT even if some features may not work as expected in FT' + ) + + args = parser.parse_args() + + s3_path = None + hf_path = None + if 's3' in args.name_or_dir: + s3_path = args.name_or_dir + else: + hf_path = args.name_or_dir + + if not comm.is_model_parallel_initailized(): + # pipeline parallelism is 1 for now + comm.initialize_model_parallel(tensor_para_size=args.gpus, + pipeline_para_size=1) + + if comm.get_rank() == 0: + download_convert(s3_path=s3_path, + hf_path=hf_path, + gpus=args.gpus, + force_conversion=args.force) + if dist.is_initialized(): + dist.barrier() + + model_handle = MPTFTModelHandler(args.name_or_dir, args.ft_lib_path, + args.inference_data_type, args.int8_mode, + args.gpus) + inputs = {'input': 'Who is the president of the USA?'} + out = model_handle.predict([inputs]) + print(out[0]) From 36a57aaaa74abaaf33ac6727b066cdc5c531ed79 Mon Sep 17 00:00:00 2001 From: Ajay Saini Date: Wed, 21 Jun 2023 14:25:35 -0700 Subject: [PATCH 02/12] Hosted handler for MPT (#383) --- .../mpt/mpt_hosted_handler.py | 424 ++++++++++++++++++ 1 file changed, 424 insertions(+) create mode 100644 
examples/inference-deployments/mpt/mpt_hosted_handler.py diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py new file mode 100644 index 000000000..8866dd7c8 --- /dev/null +++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py @@ -0,0 +1,424 @@ +# Copyright 2022 MosaicML Examples authors +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import configparser +import copy +import os +from typing import Dict, List, Tuple, Optional +from pathlib import Path +from urllib.parse import urlparse + +import boto3 +import botocore +import torch +import torch.distributed as dist +from FasterTransformer.examples.pytorch.gpt.utils.parallel_gpt import ParallelGPT # yapf: disable # type: ignore +from FasterTransformer.examples.pytorch.gpt.utils import comm # yapf: disable # type: ignore +from scripts.inference.convert_hf_mpt_to_ft import convert_mpt_to_ft # yapf: disable # type: ignore +from torch.nn.utils.rnn import pad_sequence +from transformers import AutoTokenizer +from huggingface_hub import snapshot_download + +LOCAL_CHECKPOINT_DIR = '/tmp/mpt' +LOCAL_MODEL_PATH = os.path.join(LOCAL_CHECKPOINT_DIR, 'local_model') + + +def download_convert(s3_path: Optional[str] = None, + hf_path: Optional[str] = None, + gpus: int = 1, + force_conversion: bool = False): + """Download model and convert to FasterTransformer format. + + Args: + s3_path (str): Path for model location in an s3 bucket. 
+ hf_path (str): Name of the model as on HF hub (e.g., mosaicml/mpt-7b-instruct) or local folder name containing + the model (e.g., mpt-7b-instruct) + gpus (int): Number of gpus to use for inference (Default: 1) + force_conversion (bool): Force conversion to FT even if some features may not work as expected in FT (Default: False) + """ + if not s3_path and not hf_path: + raise RuntimeError( + 'Either s3_path or hf_path must be provided to download_convert') + model_name_or_path: str = '' + if s3_path: + # s3 creds need to already be present as env vars + s3 = boto3.client('s3') + model_name_or_path = LOCAL_MODEL_PATH + + # Download model files + if os.path.exists(LOCAL_MODEL_PATH): + print( + f'[+] Path {LOCAL_MODEL_PATH} already exists, skipping download' + ) + else: + Path(LOCAL_MODEL_PATH).mkdir(parents=True, exist_ok=True) + + print(f'Downloading model from path: {s3_path}') + + parsed_path = urlparse(s3_path) + + objs = s3.list_objects_v2( + Bucket=parsed_path.netloc, + Prefix=parsed_path.path.lstrip('/'), + ) + for obj in objs['Contents']: + file_key = obj['Key'] + try: + file_name = os.path.basename(file_key) + s3.download_file(Bucket=parsed_path.netloc, + Key=file_key, + Filename=os.path.join( + LOCAL_MODEL_PATH, file_name)) + except botocore.exceptions.ClientError as e: + print( + f'Error downloading file with key: {file_key} with error: {e}' + ) + elif hf_path: + print(f'Downloading HF model with name: {hf_path}') + model_name_or_path = hf_path + snapshot_download(repo_id=hf_path) + + # This is the format the the conversion script saves the converted checkpoint in + local_ft_model_path = os.path.join(LOCAL_CHECKPOINT_DIR, f'{gpus}-gpu') + ckpt_config_path = os.path.join(local_ft_model_path, 'config.ini') + + # Convert model to FT format + # If FT checkpoint doesn't exist, create it. 
+ if not os.path.isfile(ckpt_config_path): + print('Converting model to FT format') + # Datatype of weights in the HF checkpoint + weight_data_type = 'fp32' + convert_mpt_to_ft(model_name_or_path, LOCAL_CHECKPOINT_DIR, gpus, + weight_data_type, force_conversion) + if not os.path.isfile(ckpt_config_path): + raise RuntimeError('Failed to create FT checkpoint') + else: + print(f'Reusing existing FT checkpoint at {local_ft_model_path}') + + +class MPTFTHostedModelHandler: + # This is what the user request will contain + INPUT_GENERATE_KWARGS = { + 'max_new_tokens': 256, + 'top_p': 0.95, + 'top_k': 50, + 'temperature': 0.8, + } + + # These are the args we need to map the user request to before running generate() + # with FasterTransformer + FT_GENERATE_KWARGS = { + # Output sequence length to generate. + 'output_len': 256, + # Beam width for beam search + 'beam_width': 1, + # top k candidate number + 'top_k': 50, + # top p probability threshold + 'top_p': 0.95, + # temperature parameter + 'temperature': 0.8, + # Penalty for repetitions + 'repetition_penalty': 1.0, + # Presence penalty. Similar to repetition, but additive rather than multiplicative. + 'presence_penalty': 0.0, + 'beam_search_diversity_rate': 0.0, + 'len_penalty': 0.0, + 'bad_words_list': None, + # A minimum number of tokens to generate. + 'min_length': 0, + # if True, use different random seed for sentences in a batch. + 'random_seed': True + } + + INPUT_KEY = 'inputs' + PARAMETERS_KEY = 'parameters' + + def __init__(self, + model_name_or_path: str, + ft_lib_path: str, + inference_data_type: str = 'bf16', + int8_mode: int = 0, + gpus: int = 1): + """Fastertransformer model handler for MPT foundation series. + + Args: + model_name_or_path (str): Name of the model as on HF hub (e.g., mosaicml/mpt-7b-instruct) or local model name (e.g., mpt-7b-instruct) + ft_lib_path (str): Path to the libth_transformer dynamic lib file(.e.g., build/lib/libth_transformer.so). 
+ inference_data_type (str): Data type to use for inference (Default: bf16) + int8_mode (int): The level of quantization to perform. 0: No quantization. All computation in data_type, + 1: Quantize weights to int8, all compute occurs in fp16/bf16. Not supported when data_type is fp32 + gpus (int): Number of gpus to use for inference (Default: 1) + """ + self.model_name_or_path = model_name_or_path + + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, + trust_remote_code=True) + + # Make sure the seed on all ranks is the same. This is important. + # Multi-gpu generate calls will hang without this. + torch.manual_seed(0) + + model_path = os.path.join(LOCAL_CHECKPOINT_DIR, f'{gpus}-gpu') + ckpt_config_path = os.path.join(model_path, 'config.ini') + + ckpt_config = configparser.ConfigParser() + ckpt_config.read(ckpt_config_path) + + # Disable this optimization. + # https://github.com/NVIDIA/FasterTransformer/blob/main/docs/gpt_guide.md#advanced-features + shared_contexts_ratio = 0.0 + + if 'gpt' in ckpt_config.keys(): + head_num = ckpt_config.getint('gpt', 'head_num') + size_per_head = ckpt_config.getint('gpt', 'size_per_head') + vocab_size = ckpt_config.getint('gpt', 'vocab_size') + start_id = ckpt_config.getint('gpt', 'start_id') + end_id = ckpt_config.getint('gpt', 'end_id') + layer_num = ckpt_config.getint('gpt', 'num_layer') + max_seq_len = ckpt_config.getint('gpt', 'max_pos_seq_len') + weights_data_type = ckpt_config.get('gpt', 'weight_data_type') + tensor_para_size = ckpt_config.getint('gpt', 'tensor_para_size') + pipeline_para_size = ckpt_config.getint('gpt', + 'pipeline_para_size', + fallback=1) + layernorm_eps = ckpt_config.getfloat('gpt', + 'layernorm_eps', + fallback=1e-5) + use_attention_linear_bias = ckpt_config.getboolean( + 'gpt', 'use_attention_linear_bias') + has_positional_encoding = ckpt_config.getboolean( + 'gpt', 'has_positional_encoding') + else: + raise RuntimeError( + 'Unexpected config.ini for the FT checkpoint. 
Expected FT checkpoint to contain the `gpt` key.' + ) + + self.end_id = end_id + + if not comm.is_model_parallel_initailized(): + comm.initialize_model_parallel(tensor_para_size, pipeline_para_size) + + print('Initializing FasterTransformer') + self.model = ParallelGPT( + head_num, + size_per_head, + vocab_size, + start_id, + end_id, + layer_num, + max_seq_len, + tensor_para_size, + pipeline_para_size, + lib_path=ft_lib_path, + inference_data_type=inference_data_type, + int8_mode=int8_mode, + weights_data_type=weights_data_type, + layernorm_eps=layernorm_eps, + use_attention_linear_bias=use_attention_linear_bias, + has_positional_encoding=has_positional_encoding, + shared_contexts_ratio=shared_contexts_ratio) + print(f'Loading FT checkpoint from {model_path}') + if not self.model.load(ckpt_path=model_path): + raise RuntimeError( + 'Could not load model from a FasterTransformer checkpoint') + print('FT initialization complete') + + self.device = comm.get_device() + + def _parse_model_request(self, model_request: Dict) -> Tuple[str, Dict]: + if self.INPUT_KEY not in model_request: + raise RuntimeError( + f'"{self.INPUT_KEY}" must be provided to generate call') + + generate_input = model_request[self.INPUT_KEY] + + # Set default generate kwargs + generate_kwargs = copy.deepcopy(self.INPUT_GENERATE_KWARGS) + # If request contains any additional kwargs, add them to generate_kwargs + for k, v in model_request.get(self.PARAMETERS_KEY, {}).items(): + generate_kwargs[k] = v + + return generate_input, generate_kwargs + + def _map_input_generate_params_to_ft_params(self, + generate_kwargs: Dict) -> Dict: + # Use the default ft args as the base + ft_args = copy.deepcopy(self.FT_GENERATE_ARGS) + + # max_new_tokens is called output_len in FasterTransformer + ft_args['output_len'] = generate_kwargs['max_new_tokens'] + + # top_p, top_k, and temperature map 1:1 + ft_args['top_p'] = generate_kwargs['top_p'] + ft_args['top_k'] = generate_kwargs['top_k'] + ft_args['temperature'] = 
generate_kwargs['temperature'] + + return ft_args + + def _convert_kwargs(self, generate_inputs: List[str], + generate_kwargs: Dict): + """Converts generate_kwargs into required torch types.""" + batch_size = len(generate_inputs) + + # Allow 'max_length' to be an alias for 'output_len'. Makes it less + # likely clients break when we swap in the FT handler. + if 'max_length' in generate_kwargs: + generate_kwargs['output_len'] = generate_kwargs['max_length'] + del generate_kwargs['max_length'] + + # Integer args may be floats if the values are from a json payload. + generate_kwargs['output_len'] = int(generate_kwargs['output_len']) + generate_kwargs['top_k'] = int(generate_kwargs['top_k']) * torch.ones( + batch_size, dtype=torch.int32) + generate_kwargs['top_p'] *= torch.ones(batch_size, dtype=torch.float32) + generate_kwargs['temperature'] *= torch.ones(batch_size, + dtype=torch.float32) + repetition_penalty = generate_kwargs['repetition_penalty'] + generate_kwargs[ + 'repetition_penalty'] = None if repetition_penalty == 1.0 else repetition_penalty * torch.ones( + batch_size, dtype=torch.float32) + presence_penalty = generate_kwargs['presence_penalty'] + generate_kwargs[ + 'presence_penalty'] = None if presence_penalty == 0.0 else presence_penalty * torch.ones( + batch_size, dtype=torch.float32) + generate_kwargs['beam_search_diversity_rate'] *= torch.ones( + batch_size, dtype=torch.float32) + generate_kwargs['len_penalty'] *= torch.ones(size=[batch_size], + dtype=torch.float32) + generate_kwargs['min_length'] = int( + generate_kwargs['min_length']) * torch.ones(size=[batch_size], + dtype=torch.int32) + if generate_kwargs['random_seed']: + generate_kwargs['random_seed'] = torch.randint(0, + 10000, + size=[batch_size], + dtype=torch.int64) + + def _parse_model_requests( + self, model_requests: List[Dict]) -> Tuple[List[str], Dict]: + """Splits requests into a flat list of inputs and merged kwargs.""" + generate_inputs = [] + generate_kwargs = {} + for req in 
model_requests: + generate_input, generate_kwarg = self._parse_model_request(req) + generate_inputs += [generate_input] + + # In the case of batched requests, make sure that all requests in the batch + # have the same generate kwargs and if not throw an error + for k, v in generate_kwarg.items(): + if k in generate_kwargs and generate_kwargs[k] != v: + raise RuntimeError( + f'Request has conflicting values for kwarg {k}') + generate_kwargs[k] = v + + return generate_inputs, generate_kwargs + + @torch.no_grad() + def predict(self, model_requests: List[Dict]) -> List[str]: + generate_inputs, input_generate_kwargs = self._parse_model_requests( + model_requests) + # Map our input generate kwargs to the ones FasterTransformer expects + generate_kwargs = self._map_input_generate_params_to_ft_params( + input_generate_kwargs) + self._convert_kwargs(generate_inputs, generate_kwargs) + + start_ids = [ + torch.tensor(self.tokenizer.encode(c), + dtype=torch.int32, + device=self.device) for c in generate_inputs + ] + start_lengths = [len(ids) for ids in start_ids] + start_ids = pad_sequence(start_ids, + batch_first=True, + padding_value=self.end_id) + start_lengths = torch.IntTensor(start_lengths) + tokens_batch = self.model(start_ids, start_lengths, **generate_kwargs) + outputs = [] + for tokens in tokens_batch: + for beam_id in range(generate_kwargs['beam_width']): + # Do not exclude context input from the output + # token = tokens[beam_id][start_lengths[i]:] + token = tokens[beam_id] + # stop at end_id; This is the same as eos_token_id + token = token[token != self.end_id] + output = self.tokenizer.decode(token) + outputs.append(output) + return outputs + + def predict_stream(self, **model_requests: Dict): + raise RuntimeError('Streaming is not supported with FasterTransformer!') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument( + '--ft_lib_path', + type=str, + required=True, + help= 
+ 'Path to the libth_transformer dynamic lib file(e.g., build/lib/libth_transformer.so.' + ) + parser.add_argument( + '--name_or_dir', + '-i', + type=str, + help= + 'HF hub Model name (e.g., mosaicml/mpt-7b) or local dir path to load checkpoint from', + required=True) + parser.add_argument('--inference_data_type', + '--data_type', + type=str, + choices=['fp32', 'fp16', 'bf16'], + default='bf16') + parser.add_argument( + '--int8_mode', + type=int, + default=0, + choices=[0, 1], + help= + 'The level of quantization to perform. 0: No quantization. All computation in data_type. 1: Quantize weights to int8, all compute occurs in fp16/bf16. Not supported when data_type is fp32' + ) + parser.add_argument('--gpus', + type=int, + default=1, + help='The number of gpus to use for inference.') + + parser.add_argument( + '--force', + action='store_true', + help= + 'Force conversion to FT even if some features may not work as expected in FT' + ) + + args = parser.parse_args() + + s3_path = None + hf_path = None + if 's3' in args.name_or_dir: + s3_path = args.name_or_dir + else: + hf_path = args.name_or_dir + + if not comm.is_model_parallel_initailized(): + # pipeline parallelism is 1 for now + comm.initialize_model_parallel(tensor_para_size=args.gpus, + pipeline_para_size=1) + + if comm.get_rank() == 0: + download_convert(s3_path=s3_path, + hf_path=hf_path, + gpus=args.gpus, + force_conversion=args.force) + if dist.is_initialized(): + dist.barrier() + + model_handle = MPTFTModelHandler(args.name_or_dir, args.ft_lib_path, + args.inference_data_type, args.int8_mode, + args.gpus) + inputs = {'input': 'Who is the president of the USA?'} + out = model_handle.predict([inputs]) + print(out[0]) From 1d8baafa1c9ce87ec3bff9e355b59bf7dd5f5827 Mon Sep 17 00:00:00 2001 From: Ajay Saini Date: Wed, 21 Jun 2023 15:12:56 -0700 Subject: [PATCH 03/12] Use max_tokens --- examples/inference-deployments/mpt/mpt_hosted_handler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py index 8866dd7c8..e78679853 100644 --- a/examples/inference-deployments/mpt/mpt_hosted_handler.py +++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py @@ -100,7 +100,7 @@ def download_convert(s3_path: Optional[str] = None, class MPTFTHostedModelHandler: # This is what the user request will contain INPUT_GENERATE_KWARGS = { - 'max_new_tokens': 256, + 'max_tokens': 256, 'top_p': 0.95, 'top_k': 50, 'temperature': 0.8, @@ -247,8 +247,8 @@ def _map_input_generate_params_to_ft_params(self, # Use the default ft args as the base ft_args = copy.deepcopy(self.FT_GENERATE_ARGS) - # max_new_tokens is called output_len in FasterTransformer - ft_args['output_len'] = generate_kwargs['max_new_tokens'] + # max_tokens is called output_len in FasterTransformer + ft_args['output_len'] = generate_kwargs['max_tokens'] # top_p, top_k, and temperature map 1:1 ft_args['top_p'] = generate_kwargs['top_p'] From 8a16f8d6f4fa95ba23969fa8dffd88d7ce393455 Mon Sep 17 00:00:00 2001 From: Ajay Saini Date: Wed, 21 Jun 2023 16:17:51 -0700 Subject: [PATCH 04/12] Naming --- examples/inference-deployments/mpt/mpt_hosted_handler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py index e78679853..61bd6ef40 100644 --- a/examples/inference-deployments/mpt/mpt_hosted_handler.py +++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py @@ -100,7 +100,7 @@ def download_convert(s3_path: Optional[str] = None, class MPTFTHostedModelHandler: # This is what the user request will contain INPUT_GENERATE_KWARGS = { - 'max_tokens': 256, + 'max_new_tokens': 256, 'top_p': 0.95, 'top_k': 50, 'temperature': 0.8, @@ -132,7 +132,7 @@ class MPTFTHostedModelHandler: 'random_seed': True } - INPUT_KEY = 'inputs' + INPUT_KEY = 'input' PARAMETERS_KEY = 
'parameters' def __init__(self, @@ -247,8 +247,8 @@ def _map_input_generate_params_to_ft_params(self, # Use the default ft args as the base ft_args = copy.deepcopy(self.FT_GENERATE_ARGS) - # max_tokens is called output_len in FasterTransformer - ft_args['output_len'] = generate_kwargs['max_tokens'] + # max_new_tokens is called output_len in FasterTransformer + ft_args['output_len'] = generate_kwargs['max_new_tokens'] # top_p, top_k, and temperature map 1:1 ft_args['top_p'] = generate_kwargs['top_p'] From 425b1e5faff5d9c46f65a182b79b48b745998cc7 Mon Sep 17 00:00:00 2001 From: Ajay Saini Date: Wed, 21 Jun 2023 16:19:42 -0700 Subject: [PATCH 05/12] Input naming (#385) --- examples/inference-deployments/mpt/mpt_hosted_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py index 8866dd7c8..61bd6ef40 100644 --- a/examples/inference-deployments/mpt/mpt_hosted_handler.py +++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py @@ -132,7 +132,7 @@ class MPTFTHostedModelHandler: 'random_seed': True } - INPUT_KEY = 'inputs' + INPUT_KEY = 'input' PARAMETERS_KEY = 'parameters' def __init__(self, From 8d3fcebf9e1d54054114089e8d832bbb5b6c7507 Mon Sep 17 00:00:00 2001 From: Ajay Saini Date: Wed, 21 Jun 2023 16:40:17 -0700 Subject: [PATCH 06/12] Args naming --- examples/inference-deployments/mpt/mpt_hosted_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py index 61bd6ef40..88809dd72 100644 --- a/examples/inference-deployments/mpt/mpt_hosted_handler.py +++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py @@ -245,7 +245,7 @@ def _parse_model_request(self, model_request: Dict) -> Tuple[str, Dict]: def _map_input_generate_params_to_ft_params(self, generate_kwargs: Dict) -> Dict: # Use the 
default ft args as the base - ft_args = copy.deepcopy(self.FT_GENERATE_ARGS) + ft_args = copy.deepcopy(self.FT_GENERATE_KWARGS) # max_new_tokens is called output_len in FasterTransformer ft_args['output_len'] = generate_kwargs['max_new_tokens'] From 70d91518322d01e4c316eedb72848fe2461cef71 Mon Sep 17 00:00:00 2001 From: Ajay Saini Date: Wed, 21 Jun 2023 16:41:11 -0700 Subject: [PATCH 07/12] Fixed FT args naming (#387) --- examples/inference-deployments/mpt/mpt_hosted_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py index 61bd6ef40..88809dd72 100644 --- a/examples/inference-deployments/mpt/mpt_hosted_handler.py +++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py @@ -245,7 +245,7 @@ def _parse_model_request(self, model_request: Dict) -> Tuple[str, Dict]: def _map_input_generate_params_to_ft_params(self, generate_kwargs: Dict) -> Dict: # Use the default ft args as the base - ft_args = copy.deepcopy(self.FT_GENERATE_ARGS) + ft_args = copy.deepcopy(self.FT_GENERATE_KWARGS) # max_new_tokens is called output_len in FasterTransformer ft_args['output_len'] = generate_kwargs['max_new_tokens'] From eebef2c5745a9b3e4ed14394f0af90ca6f9aaa3a Mon Sep 17 00:00:00 2001 From: Ajay Saini Date: Wed, 21 Jun 2023 19:34:54 -0700 Subject: [PATCH 08/12] Exclude input from output (#388) * Enable CodeQL for pull requests (#374) This reverts commit 1a0492376cb1fc0b83e6e5db1e6edbb06c60c175. 
* Update --------- Co-authored-by: bandish-shah <86627118+bandish-shah@users.noreply.github.com> --- .github/workflows/codeql-analysis.yml | 3 +++ .../inference-deployments/mpt/mpt_7b_ft_handler.py | 11 +++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index fbeef28d2..f3ecf9a1f 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -14,6 +14,9 @@ name: "CodeQL" on: push: branches: [ main ] + pull_request: + # The branches below must be a subset of the branches above + branches: [ main ] schedule: - cron: '0 9 * * 1' # Every Monday at 09:00 (9:00 AM) diff --git a/examples/inference-deployments/mpt/mpt_7b_ft_handler.py b/examples/inference-deployments/mpt/mpt_7b_ft_handler.py index 0f2621c5b..36fb88a50 100644 --- a/examples/inference-deployments/mpt/mpt_7b_ft_handler.py +++ b/examples/inference-deployments/mpt/mpt_7b_ft_handler.py @@ -307,11 +307,14 @@ def predict(self, model_requests: List[Dict]) -> List[str]: start_lengths = torch.IntTensor(start_lengths) tokens_batch = self.model(start_ids, start_lengths, **generate_kwargs) outputs = [] - for tokens in tokens_batch: + for i, tokens in enumerate(tokens_batch): for beam_id in range(generate_kwargs['beam_width']): - # Do not exclude context input from the output - # token = tokens[beam_id][start_lengths[i]:] - token = tokens[beam_id] + # Exclude context input from the output + token = tokens[beam_id][start_lengths[i]:] + + # Do this to include context input in the output + # token = tokens[beam_id] + # stop at end_id; This is the same as eos_token_id token = token[token != self.end_id] output = self.tokenizer.decode(token) From 6af4013893c7f2d4d1d090f493d5becae3d1a7f9 Mon Sep 17 00:00:00 2001 From: Margaret Qian Date: Wed, 21 Jun 2023 19:47:25 -0700 Subject: [PATCH 09/12] Add skip_special_tokens (#386) --- examples/inference-deployments/mpt/mpt_hosted_handler.py | 2 +- 1
file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py index 88809dd72..fcb3144fe 100644 --- a/examples/inference-deployments/mpt/mpt_hosted_handler.py +++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py @@ -343,7 +343,7 @@ def predict(self, model_requests: List[Dict]) -> List[str]: token = tokens[beam_id] # stop at end_id; This is the same as eos_token_id token = token[token != self.end_id] - output = self.tokenizer.decode(token) + output = self.tokenizer.decode(token, skip_special_tokens=True) outputs.append(output) return outputs From dceba1ad8c05582bc9e4853ec69623b558b26aed Mon Sep 17 00:00:00 2001 From: Ajay Saini Date: Wed, 21 Jun 2023 19:55:32 -0700 Subject: [PATCH 10/12] Exclude input tokens --- .../inference-deployments/mpt/mpt_hosted_handler.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py index fcb3144fe..ef2181859 100644 --- a/examples/inference-deployments/mpt/mpt_hosted_handler.py +++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py @@ -336,11 +336,14 @@ def predict(self, model_requests: List[Dict]) -> List[str]: start_lengths = torch.IntTensor(start_lengths) tokens_batch = self.model(start_ids, start_lengths, **generate_kwargs) outputs = [] - for tokens in tokens_batch: + for i, tokens in enumerate(tokens_batch): for beam_id in range(generate_kwargs['beam_width']): - # Do not exclude context input from the output - # token = tokens[beam_id][start_lengths[i]:] - token = tokens[beam_id] + # Exclude context input from the output + token = tokens[beam_id][start_lengths[i]:] + + # Do this to exclude context input from the output + # token = tokens[beam_id] + # stop at end_id; This is the same as eos_token_id token = token[token != self.end_id] output = 
self.tokenizer.decode(token, skip_special_tokens=True) From c98b5ce713645d96c2663a53ff879ace66312dbc Mon Sep 17 00:00:00 2001 From: Ajay Saini Date: Wed, 21 Jun 2023 19:58:14 -0700 Subject: [PATCH 11/12] Exclude input tokens in output (#389) --- .../inference-deployments/mpt/mpt_hosted_handler.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py index fcb3144fe..ef2181859 100644 --- a/examples/inference-deployments/mpt/mpt_hosted_handler.py +++ b/examples/inference-deployments/mpt/mpt_hosted_handler.py @@ -336,11 +336,14 @@ def predict(self, model_requests: List[Dict]) -> List[str]: start_lengths = torch.IntTensor(start_lengths) tokens_batch = self.model(start_ids, start_lengths, **generate_kwargs) outputs = [] - for tokens in tokens_batch: + for i, tokens in enumerate(tokens_batch): for beam_id in range(generate_kwargs['beam_width']): - # Do not exclude context input from the output - # token = tokens[beam_id][start_lengths[i]:] - token = tokens[beam_id] + # Exclude context input from the output + token = tokens[beam_id][start_lengths[i]:] + + # Do this to exclude context input from the output + # token = tokens[beam_id] + # stop at end_id; This is the same as eos_token_id token = token[token != self.end_id] output = self.tokenizer.decode(token, skip_special_tokens=True) From 101552d5ba9a2f622c17e1073788b2b0200d9e38 Mon Sep 17 00:00:00 2001 From: Ajay Saini Date: Wed, 21 Jun 2023 20:28:07 -0700 Subject: [PATCH 12/12] Updated --- .../inference-deployments/mpt/mpt_hosted_handler.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/inference-deployments/mpt/mpt_hosted_handler.py b/examples/inference-deployments/mpt/mpt_hosted_handler.py index ef2181859..a9602394e 100644 --- a/examples/inference-deployments/mpt/mpt_hosted_handler.py +++ 
b/examples/inference-deployments/mpt/mpt_hosted_handler.py @@ -140,7 +140,8 @@ def __init__(self, ft_lib_path: str, inference_data_type: str = 'bf16', int8_mode: int = 0, - gpus: int = 1): + gpus: int = 1, + exclude_input_from_output: bool = False): """Fastertransformer model handler for MPT foundation series. Args: @@ -150,6 +151,7 @@ def __init__(self, int8_mode (int): The level of quantization to perform. 0: No quantization. All computation in data_type, 1: Quantize weights to int8, all compute occurs in fp16/bf16. Not supported when data_type is fp32 gpus (int): Number of gpus to use for inference (Default: 1) + exclude_input_from_output (bool): True to exclude input from the model output, false otherwise. """ self.model_name_or_path = model_name_or_path @@ -227,6 +229,8 @@ def __init__(self, self.device = comm.get_device() + self.exclude_input_from_output = exclude_input_from_output + def _parse_model_request(self, model_request: Dict) -> Tuple[str, Dict]: if self.INPUT_KEY not in model_request: raise RuntimeError( @@ -338,11 +342,10 @@ def predict(self, model_requests: List[Dict]) -> List[str]: outputs = [] for i, tokens in enumerate(tokens_batch): for beam_id in range(generate_kwargs['beam_width']): + token = tokens[beam_id] # Exclude context input from the output - token = tokens[beam_id][start_lengths[i]:] - - # Do this to exclude context input from the output - # token = tokens[beam_id] + if self.exclude_input_from_output: + token = token[start_lengths[i]:] # stop at end_id; This is the same as eos_token_id token = token[token != self.end_id]