File renamed without changes.
File renamed without changes.
4 changes: 2 additions & 2 deletions cacheflow/master/scheduler.py → cacheflow/core/scheduler.py
@@ -4,8 +4,8 @@
 import time
 from typing import Any, Dict, List, Optional, Tuple

-from cacheflow.master.block_manager import BlockSpaceManager
-from cacheflow.master.policy import PolicyFactory
+from cacheflow.core.block_manager import BlockSpaceManager
+from cacheflow.core.policy import PolicyFactory
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import Sequence
 from cacheflow.sequence import SequenceGroup
9 changes: 5 additions & 4 deletions cacheflow/master/server.py → cacheflow/core/server.py
@@ -8,20 +8,21 @@
 except ImportError:
     ray = None

+from cacheflow.core.scheduler import Scheduler
+from cacheflow.frontend.simple_frontend import SimpleFrontend
 from cacheflow.logger import init_logger
-from cacheflow.master.scheduler import Scheduler
-from cacheflow.master.simple_frontend import SimpleFrontend
-from cacheflow.models import get_memory_analyzer
-from cacheflow.worker.controller import Controller, DeviceID
+from cacheflow.model_executor import get_memory_analyzer
 from cacheflow.sequence import SequenceGroup
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.utils import get_gpu_memory, get_cpu_memory
+from cacheflow.worker.controller import Controller, DeviceID


 logger = init_logger(__name__)


 class Server:

     def __init__(
         self,
         model: str,
@@ -1,22 +1,22 @@
 import argparse
 import asyncio
+import json
 import time
 from typing import List, Dict, Optional
-import json

-import ray
-from transformers import AutoTokenizer
 from fastapi import FastAPI, Request
 from fastapi.responses import StreamingResponse
+import ray
+from transformers import AutoTokenizer
 import uvicorn

+from cacheflow.core.server import (Server, add_server_arguments,
+                                   process_server_arguments,
+                                   initialize_cluster)
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import Sequence, SequenceGroup
-from cacheflow.master.server import (Server, add_server_arguments,
-                                     process_server_arguments,
-                                     initialize_cluster)
-from cacheflow.worker.controller import DeviceID
 from cacheflow.utils import Counter, get_gpu_memory, get_cpu_memory
+from cacheflow.worker.controller import DeviceID

 TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds
 app = FastAPI()
11 changes: 11 additions & 0 deletions cacheflow/model_executor/__init__.py
@@ -0,0 +1,11 @@
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.model_loader import get_model, get_memory_analyzer
+from cacheflow.model_executor.utils import set_random_seed
+
+
+__all__ = [
+    "InputMetadata",
+    "get_model",
+    "get_memory_analyzer",
+    "set_random_seed",
+]
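For orientation in the rest of the diff: callers now import the model-executor entry points from this new package instead of `cacheflow.models`. A minimal sketch of the intended import surface (the exported names are taken from the `__all__` above; the single-int seed argument is an assumption):

from cacheflow.model_executor import (InputMetadata, get_model,
                                      get_memory_analyzer, set_random_seed)

set_random_seed(0)  # assumed signature: one integer seed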
@@ -7,7 +7,7 @@
 from cacheflow import attention_ops
 from cacheflow import cache_ops
 from cacheflow import pos_encoding_ops
-from cacheflow.models import InputMetadata
+from cacheflow.model_executor.input_metadata import InputMetadata


 class GPTCacheFlowAttention(nn.Module):
@@ -3,10 +3,11 @@
 import torch
 import torch.nn as nn

-from cacheflow.models import InputMetadata
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+    gather_from_tensor_model_parallel_region)
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import SequenceOutputs
-from cacheflow.parallel_utils.tensor_parallel import gather_from_tensor_model_parallel_region


 class Sampler(nn.Module):
@@ -27,7 +28,7 @@ def forward(
         # Get the logits for the next tokens.
         logits = torch.matmul(hidden_states, embedding.t())
         logits = gather_from_tensor_model_parallel_region(logits)
-        # Remove paddings in vocab.
+        # Remove paddings in vocab (if any).
         logits = logits[:, :self.vocab_size]

         # Apply temperature scaling.
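Context for the comment tweak above: the embedding table may be padded so the vocab dimension divides evenly across tensor-parallel ranks, and the sampler slices that padding off after gathering the logits. A self-contained illustration of the slice, with made-up shapes:

import torch

vocab_size = 50257         # true vocabulary size (e.g. GPT-2)
padded_vocab_size = 50304  # padded for even tensor-parallel partitioning
hidden_size = 768

hidden_states = torch.randn(4, hidden_size)              # [num_tokens, hidden]
embedding = torch.randn(padded_vocab_size, hidden_size)  # padded embedding table

logits = hidden_states @ embedding.t()  # [num_tokens, padded_vocab_size]
logits = logits[:, :vocab_size]         # drop the padding columns before sampling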
@@ -2,7 +2,7 @@
 from transformers import AutoConfig

 from cacheflow.logger import init_logger
-from cacheflow.models.utils import get_dtype_size
+from cacheflow.model_executor.utils import get_dtype_size


 logger = init_logger(__name__)
@@ -5,16 +5,13 @@
 from transformers import AutoConfig
 from transformers import PretrainedConfig

-from cacheflow.models.memory_analyzer import CacheFlowMemoryAnalyzer
-from cacheflow.models.memory_analyzer import GPT2MemoryAnalyzer
-from cacheflow.models.memory_analyzer import GPTNeoXMemoryAnalyzer
-from cacheflow.models.memory_analyzer import LlamaMemoryAnalyzer
-from cacheflow.models.memory_analyzer import OPTMemoryAnalyzer
-from cacheflow.models.gpt2 import GPT2LMHeadModel
-from cacheflow.models.gpt_neox import GPTNeoXForCausalLM
-from cacheflow.models.llama import LlamaForCausalLM
-from cacheflow.models.opt import OPTForCausalLM
-from cacheflow.models.utils import get_torch_dtype
+from cacheflow.model_executor.memory_analyzer import (
+    CacheFlowMemoryAnalyzer, GPT2MemoryAnalyzer, GPTNeoXMemoryAnalyzer,
+    LlamaMemoryAnalyzer, OPTMemoryAnalyzer)
+from cacheflow.model_executor.models import (
+    GPT2LMHeadModel, GPTNeoXForCausalLM, LlamaForCausalLM, OPTForCausalLM)
+from cacheflow.model_executor.utils import get_torch_dtype
+from cacheflow.model_executor.weight_utils import initialize_dummy_weights


 _MODELS = {
@@ -77,7 +74,7 @@ def get_model(
             model = model.cuda()
             # NOTE(woosuk): For precise performance evaluation, we assign
             # random values to the weights.
-            model.initialize_dummy_weights()
+            initialize_dummy_weights(model)
         else:
             # Create a model instance.
             model = model_class(config)
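Each model below loses its private `initialize_dummy_weights` method in favor of this shared helper imported from `cacheflow.model_executor.weight_utils`. Judging from the removed method bodies further down, the helper is presumably along these lines (a sketch, not the PR's verbatim code):

import torch.nn as nn

def initialize_dummy_weights(model: nn.Module) -> None:
    # Same body as the removed per-model methods: fill every parameter with
    # small uniform noise so benchmarks can skip loading real checkpoints.
    for param in model.state_dict().values():
        param.data.uniform_(-1e-3, 1e-3)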
12 changes: 12 additions & 0 deletions cacheflow/model_executor/models/__init__.py
@@ -0,0 +1,12 @@
+from cacheflow.model_executor.models.gpt_neox import GPTNeoXForCausalLM
+from cacheflow.model_executor.models.gpt2 import GPT2LMHeadModel
+from cacheflow.model_executor.models.llama import LlamaForCausalLM
+from cacheflow.model_executor.models.opt import OPTForCausalLM
+
+
+__all__ = [
+    "GPT2LMHeadModel",
+    "GPTNeoXForCausalLM",
+    "LlamaForCausalLM",
+    "OPTForCausalLM",
+]
@@ -5,16 +5,15 @@
 from torch import nn
 from transformers import GPT2Config

-from cacheflow.models import InputMetadata
-from cacheflow.models.attention import GPTCacheFlowAttention
-from cacheflow.models.sample import Sampler
-from cacheflow.models.utils import (hf_model_weights_iterator,
-                                    load_tensor_parallel_weights)
-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention
+from cacheflow.model_executor.layers.sampler import Sampler
+from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
+                                                   load_tensor_parallel_weights)
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.parallel_utils.tensor_parallel import (VocabParallelEmbedding,
-                                                      ColumnParallelLinear,
-                                                      RowParallelLinear)
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
 from cacheflow.sequence import SequenceOutputs

 KVCache = Tuple[torch.Tensor, torch.Tensor]
@@ -258,8 +257,5 @@ def load_weights(self, model_name_or_path: str,
                 raise ValueError(f"Unexpected parameter name {name}")
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
-                                         self._row_parallel_weights)
-
-    def initialize_dummy_weights(self) -> None:
-        for param in self.state_dict().values():
-            param.data.uniform_(-1e-3, 1e-3)
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
@@ -3,25 +3,25 @@

 import torch
 from torch import nn
-
-from cacheflow.models import InputMetadata
-from cacheflow.models.attention import GPTNeoXCacheFlowAttention
-from cacheflow.models.sample import Sampler
-from cacheflow.models.utils import (hf_model_weights_iterator,
-                                    load_tensor_parallel_weights)
-from cacheflow.parallel_utils.parallel_state import (
+from transformers import GPTNeoXConfig
+
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.layers.attention import GPTNeoXCacheFlowAttention
+from cacheflow.model_executor.layers.sampler import Sampler
+from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
+                                                   load_tensor_parallel_weights)
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.parallel_utils.tensor_parallel import (VocabParallelEmbedding,
-                                                      ColumnParallelLinear,
-                                                      RowParallelLinear)
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
 from cacheflow.sequence import SequenceOutputs

 KVCache = Tuple[torch.Tensor, torch.Tensor]


 class GPTNeoXAttention(nn.Module):

-    def __init__(self, config):
+    def __init__(self, config: GPTNeoXConfig):
         super().__init__()
         self.total_num_heads = config.num_attention_heads
         self.hidden_size = config.hidden_size
@@ -63,7 +63,7 @@ def forward(


 class GPTNeoXMLP(nn.Module):
-    def __init__(self, config):
+    def __init__(self, config: GPTNeoXConfig):
         super().__init__()
         self.dense_h_to_4h = ColumnParallelLinear(config.hidden_size,
                                                   config.intermediate_size,
@@ -86,7 +86,7 @@ def forward(self, hidden_states):

 class GPTNeoXLayer(nn.Module):

-    def __init__(self, config):
+    def __init__(self, config: GPTNeoXConfig):
         super().__init__()
         self.use_parallel_residual = config.use_parallel_residual
         self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
@@ -129,7 +129,7 @@ def forward(


 class GPTNeoXModel(nn.Module):
-    def __init__(self, config):
+    def __init__(self, config: GPTNeoXConfig):
         super().__init__()
         self.config = config

@@ -227,8 +227,5 @@ def load_weights(self, model_name_or_path: str,
                 raise ValueError(f"Unexpected weight name: {name}")
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
-                                         self._row_parallel_weights)
-
-    def initialize_dummy_weights(self) -> None:
-        for param in self.state_dict().values():
-            param.data.uniform_(-1e-3, 1e-3)
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
@@ -5,18 +5,18 @@
 from torch import nn
 from transformers import LlamaConfig

-from cacheflow.models import InputMetadata
-from cacheflow.models.activation import SiluAndMul
-from cacheflow.models.attention import GPTNeoXCacheFlowAttention
-from cacheflow.models.layernorm import RMSNorm
-from cacheflow.models.sample import Sampler
-from cacheflow.models.utils import (hf_model_weights_iterator,
-                                    load_tensor_parallel_weights)
-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.sequence import SequenceOutputs
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.layers.activation import SiluAndMul
+from cacheflow.model_executor.layers.layernorm import RMSNorm
+from cacheflow.model_executor.layers.attention import GPTNeoXCacheFlowAttention
+from cacheflow.model_executor.layers.sampler import Sampler
+from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
+                                                   load_tensor_parallel_weights)
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.parallel_utils.tensor_parallel import (VocabParallelEmbedding,
-                                                      ColumnParallelLinear,
-                                                      RowParallelLinear)
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
-from cacheflow.sequence import SequenceOutputs

 KVCache = Tuple[torch.Tensor, torch.Tensor]
@@ -263,8 +263,5 @@ def load_weights(self, model_name_or_path: str,
             param = state_dict[name]
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
-                                         self._row_parallel_weights)
-
-    def initialize_dummy_weights(self) -> None:
-        for param in self.state_dict().values():
-            param.data.uniform_(-1e-3, 1e-3)
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
24 changes: 10 additions & 14 deletions cacheflow/models/opt.py → cacheflow/model_executor/models/opt.py
@@ -5,16 +5,15 @@
 from torch import nn
 from transformers import OPTConfig

-from cacheflow.models import InputMetadata
-from cacheflow.models.attention import GPTCacheFlowAttention
-from cacheflow.models.sample import Sampler
-from cacheflow.models.utils import (hf_model_weights_iterator,
-                                    load_tensor_parallel_weights)
-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention
+from cacheflow.model_executor.layers.sampler import Sampler
+from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
+                                                   load_tensor_parallel_weights)
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.parallel_utils.tensor_parallel import (VocabParallelEmbedding,
-                                                      ColumnParallelLinear,
-                                                      RowParallelLinear)
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
 from cacheflow.sequence import SequenceOutputs

 KVCache = Tuple[torch.Tensor, torch.Tensor]
@@ -288,8 +287,5 @@ def load_weights(self, model_name_or_path: str,
             param = state_dict[name]
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
-                                         self._row_parallel_weights)
-
-    def initialize_dummy_weights(self) -> None:
-        for param in self.state_dict().values():
-            param.data.uniform_(-1e-3, 1e-3)
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
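Each `load_weights` above now forwards `tensor_model_parallel_rank` into `load_tensor_parallel_weights`, centralizing the shard-slicing logic in one place. A rough sketch of what such a helper does — the parameter names mirror the call sites above, but the body is illustrative rather than the PR's code:

import torch

def load_tensor_parallel_weights(param: torch.Tensor,
                                 loaded_weight: torch.Tensor,
                                 name: str,
                                 column_parallel_weight_names,
                                 row_parallel_weight_names,
                                 tensor_model_parallel_rank: int) -> None:
    # Column-parallel weights are sharded along dim 0, row-parallel along
    # dim 1; each rank copies only its slice of the checkpoint tensor.
    for key in column_parallel_weight_names:
        if key in name:
            size = param.shape[0]
            start = tensor_model_parallel_rank * size
            loaded_weight = loaded_weight[start:start + size]
            break
    for key in row_parallel_weight_names:
        if key in name:
            size = param.shape[1]
            start = tensor_model_parallel_rank * size
            loaded_weight = loaded_weight[:, start:start + size]
            break
    assert param.shape == loaded_weight.shape
    param.data.copy_(loaded_weight)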
12 changes: 12 additions & 0 deletions cacheflow/model_executor/parallel_utils/__init__.py
@@ -0,0 +1,12 @@
+import cacheflow.model_executor.parallel_utils.parallel_state
+import cacheflow.model_executor.parallel_utils.tensor_parallel
+import cacheflow.model_executor.parallel_utils.utils
+
+# Alias parallel_state as mpu, its legacy name
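+# (Importing the dotted submodule above also binds `parallel_state` as an
+# attribute of this package, which is why the bare name resolves below.)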
+mpu = parallel_state
+
+__all__ = [
+    "parallel_state",
+    "tensor_parallel",
+    "utils",
+]
@@ -9,7 +9,7 @@
 import torch.nn.init as init
 from torch.nn.parameter import Parameter

-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     get_all_reduce_launcher,
@@ -2,7 +2,7 @@

 import torch

-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     get_tensor_model_parallel_group,