#pragma once

#include <string>
#include <vector>
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>
#include <ctime>

#include "json.hpp"
#include "utils.hpp"

#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"

using json = nlohmann::json;
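
// Note: json_value(obj, key, default) is a helper from utils.hpp; the code
// below relies on it returning obj[key] when the field is present and the
// given default otherwise. (This description is an assumption based on how
// the helper is used here, not a definition of its exact signature.)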

inline static json oaicompat_completion_params_parse(
    const json &body /* openai api json semantics */)
{
    json llama_params;

    llama_params["__oaicompat"] = true;

    // Map OpenAI parameters to llama.cpp parameters
    //
    // For parameters that are defined by the OpenAI documentation (e.g.
    // temperature), we explicitly specify OpenAI's intended default; we
    // need to do that because sometimes OpenAI disagrees with llama.cpp
    //
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
    llama_params["prompt"]            = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
    llama_params["temperature"]       = json_value(body, "temperature", 1.0);
    llama_params["top_k"]             = json_value(body, "top_k", default_sparams.top_k);
    llama_params["top_p"]             = json_value(body, "top_p", 1.0);
    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
    llama_params["logit_bias"]        = json_value(body, "logit_bias", json::object());
    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
    llama_params["stream"]            = json_value(body, "stream", false);
    llama_params["mirostat"]          = json_value(body, "mirostat", default_sparams.mirostat);
    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", default_sparams.penalize_nl);
    llama_params["typical_p"]         = json_value(body, "typical_p", default_sparams.typical_p);
    llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
    llama_params["tfs_z"]             = json_value(body, "tfs_z", default_sparams.tfs_z);

    if (body.contains("grammar")) {
        llama_params["grammar"] = json_value(body, "grammar", json::object());
    }

    // Handle 'stop' field
    if (body.contains("stop") && body["stop"].is_string()) {
        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
    }

    // Ensure there is a ChatML-specific end sequence among the stop words
    llama_params["stop"].push_back("<|im_end|>");

    return llama_params;
}
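
// Illustrative example (an assumption for documentation purposes, not part of
// the parsing logic): given an OpenAI-style body such as
//
//   {"model": "gpt-3.5-turbo", "max_tokens": 64, "stop": "\n",
//    "messages": [{"role": "user", "content": "Hello"}]}
//
// the function above would roughly produce
//
//   llama_params["model"]     == "gpt-3.5-turbo"
//   llama_params["n_predict"] == 64
//   llama_params["stop"]      == ["\n", "<|im_end|>"]
//   llama_params["prompt"]    == the format_chatml(...) rendering of 'messages'
//
// with the remaining fields filled from the defaults listed above.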

inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
{
    json result = response.result_json;

    bool stopped_word        = result.count("stopped_word") != 0;
    bool stopped_eos         = json_value(result, "stopped_eos", false);
    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
    int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
    std::string content      = json_value(result, "content", std::string(""));

    std::string finish_reason = "length";
    if (stopped_word || stopped_eos) {
        finish_reason = "stop";
    }

    json choices =
        streaming ? json::array({json{{"finish_reason", finish_reason},
                                      {"index", 0},
                                      {"delta", json::object()}}})
                  : json::array({json{{"finish_reason", finish_reason},
                                      {"index", 0},
                                      {"message", json{{"content", content},
                                                       {"role", "assistant"}}}}});

    std::time_t t = std::time(0);

    json res =
        json{{"choices", choices},
            {"created", t},
            {"model",
                json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
            {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
            {"usage",
                json{{"completion_tokens", num_tokens_predicted},
                     {"prompt_tokens",     num_prompt_tokens},
                     {"total_tokens",      num_tokens_predicted + num_prompt_tokens}}},
            {"id", gen_chatcmplid()}};

    if (server_verbose) {
        res["__verbose"] = result;
    }

    if (result.contains("completion_probabilities")) {
        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
    }

    return res;
}
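
// For reference, a sketch of the shape of the non-streaming response built
// above (field values here are placeholders, not real output):
//
//   {
//     "id": "chatcmpl-...",
//     "object": "chat.completion",
//     "created": 1700000000,
//     "model": "gpt-3.5-turbo-0613",
//     "choices": [{"index": 0, "finish_reason": "stop",
//                  "message": {"role": "assistant", "content": "..."}}],
//     "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
//   }
//
// When streaming == true, the "object" field is "chat.completion.chunk" and
// the choice carries an empty "delta" instead of a "message".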

// The return value is a vector because there is one case where we might need to generate two responses
inline static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
    json result = response.result_json;

    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
        return std::vector<json>({response.result_json});
    }

    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));

    bool stopped_word   = json_value(result, "stopped_word", false);
    bool stopped_eos    = json_value(result, "stopped_eos", false);
    bool stopped_limit  = json_value(result, "stopped_limit", false);
    std::string content = json_value(result, "content", std::string(""));

    std::string finish_reason;
    if (stopped_word || stopped_eos) {
        finish_reason = "stop";
    }
    if (stopped_limit) {
        finish_reason = "length";
    }

    std::time_t t = std::time(0);

    json choices;

    if (!finish_reason.empty()) {
        choices = json::array({json{{"finish_reason", finish_reason},
                                    {"index", 0},
                                    {"delta", json::object()}}});
    } else {
        if (first) {
            if (content.empty()) {
                choices = json::array({json{{"finish_reason", nullptr},
                                            {"index", 0},
                                            {"delta", json{{"role", "assistant"}}}}});
            } else {
                // We have to send this as two updates to conform to OpenAI behavior
                json initial_ret = json{{"choices", json::array({json{
                                        {"finish_reason", nullptr},
                                        {"index", 0},
                                        {"delta", json{
                                            {"role", "assistant"}
                                        }}}})},
                            {"created", t},
                            {"id", gen_chatcmplid()},
                            {"model", modelname},
                            {"object", "chat.completion.chunk"}};

                json second_ret = json{
                            {"choices", json::array({json{{"finish_reason", nullptr},
                                                          {"index", 0},
                                                          {"delta", json{
                                                              {"content", content}}}
                                                          }})},
                            {"created", t},
                            {"id", gen_chatcmplid()},
                            {"model", modelname},
                            {"object", "chat.completion.chunk"}};

                return std::vector<json>({initial_ret, second_ret});
            }
        } else {
            // Some idiosyncrasy in the task processing logic makes several trailing calls
            // with empty content; we ignore these at the callee site.
            if (content.empty()) {
                return std::vector<json>({json::object()});
            }

            choices = json::array({json{
                {"finish_reason", nullptr},
                {"index", 0},
                {"delta",
                json{
                    {"content", content},
                }},
            }});
        }
    }

    json ret = json{{"choices", choices},
                    {"created", t},
                    {"id", gen_chatcmplid()},
                    {"model", modelname},
                    {"object", "chat.completion.chunk"}};

    return std::vector<json>({ret});
}
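
// For reference, a sketch of a typical intermediate streaming chunk produced
// above (values are placeholders for illustration only):
//
//   {
//     "id": "chatcmpl-...",
//     "object": "chat.completion.chunk",
//     "created": 1700000000,
//     "model": "gpt-3.5-turbo-0613",
//     "choices": [{"index": 0, "finish_reason": null,
//                  "delta": {"content": "..."}}]
//   }
//
// The final chunk instead sets "finish_reason" to "stop" or "length" and
// sends an empty "delta".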