author     Ashwin Bharambe <ashwin@meta.com>    2024-09-13 15:52:39 -0700
committer  Ashwin Bharambe <ashwin@meta.com>    2024-09-13 15:52:39 -0700
commit     d37cbd85c7b30c275add641b5ee5d7c17165b6ca
tree       8ce805b4ad95a4dd98ed3cf0af28c5095857e103
parent     659a13ae2099748d23088fbf9293521de569dd8c
Drop confusing hardware_requirements from SKU
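The hardware_requirements struct (gpu_count, memory_gb_per_gpu) baked deployment guidance into the SKU definitions, and the GPU count was the only load-bearing part: it distinguished the MP16 405b checkpoints from the MP8 ones. That checkpoint-level fact now lives in a free-form metadata dict on Model as a "pth_file_count" entry, and everything else is dropped. The gist of the migration (both lines taken from llama_meta_net_info in the diff below):

    # before: selection keyed off deployment hardware
    gpu = model.hardware_requirements.gpu_count
    # after: keyed off a property of the checkpoint itself
    pth_count = model.metadata.get("pth_file_count", 16)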
-rw-r--r--   models/datatypes.py |  11
-rw-r--r--   models/sku_list.py  | 121
2 files changed, 17 insertions, 115 deletions
diff --git a/models/datatypes.py b/models/datatypes.py
index a41fc11..3fd6ab7 100644
--- a/models/datatypes.py
+++ b/models/datatypes.py
@@ -8,7 +8,7 @@
 from enum import Enum
 from typing import Any, Dict, Optional

-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field

 from .schema_utils import json_schema_type

@@ -126,12 +126,6 @@ def model_family(model_id) -> ModelFamily:
         raise ValueError(f"Unknown model family for {CoreModelId}")


-@json_schema_type
-class HardwareRequirements(BaseModel):
-    memory_gb_per_gpu: int
-    gpu_count: int
-
-
 @json_schema_type(
     schema={
         "description": "The model family and SKU of the model along with other parameters corresponding to the model."
@@ -180,7 +174,6 @@ class Model(BaseModel):
     def variant(self) -> str:
         parts = [
             self.quantization_format.value,
-            f"mp{self.hardware_requirements.gpu_count}",
         ]
         return "-".join(parts)

@@ -194,12 +187,12 @@ class Model(BaseModel):
     description_markdown: str
     huggingface_repo: Optional[str] = None
-    hardware_requirements: HardwareRequirements
     quantization_format: CheckpointQuantizationFormat = (
         CheckpointQuantizationFormat.bf16
     )
     recommended_sampling_params: Optional[SamplingParams] = None
     model_args: Dict[str, Any]
+    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)

     @property
     def is_instruct_model(self) -> bool:
diff --git a/models/sku_list.py b/models/sku_list.py
index 801b965..b992bd8 100644
--- a/models/sku_list.py
+++ b/models/sku_list.py
@@ -11,7 +11,6 @@ from typing import List, Optional
 from .datatypes import (
     CheckpointQuantizationFormat,
     CoreModelId,
-    HardwareRequirements,
     Model,
     SamplingParams,
     SamplingStrategy,
@@ -72,10 +71,6 @@ def llama2_base_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama 2 7b model",
             huggingface_repo="meta-llama/Llama-2-7b",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=1,
-                memory_gb_per_gpu=20,
-            ),
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 4096,
@@ -95,10 +90,6 @@ def llama2_base_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama 2 13b model",
             huggingface_repo="meta-llama/Llama-2-13b",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=1,
-                memory_gb_per_gpu=28,
-            ),
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 5120,
@@ -118,10 +109,6 @@ def llama2_base_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama 2 70b model",
             huggingface_repo="meta-llama/Llama-2-70b",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=3,
-                memory_gb_per_gpu=48,
-            ),
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 8192,
@@ -146,10 +133,6 @@ def llama3_base_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama 3 8b model",
             huggingface_repo="meta-llama/Meta-Llama-3-8B",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=1,
-                memory_gb_per_gpu=20,
-            ),
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 4096,
@@ -169,10 +152,6 @@ def llama3_base_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama 3 70b model",
             huggingface_repo="meta-llama/Meta-Llama-3-70B",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=8,
-                memory_gb_per_gpu=20,
-            ),
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 8192,
@@ -197,10 +176,6 @@ def llama3_1_base_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama 3.1 8b model",
             huggingface_repo="meta-llama/Meta-Llama-3.1-8B",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=1,
-                memory_gb_per_gpu=20,
-            ),
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 4096,
@@ -220,10 +195,6 @@ def llama3_1_base_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama 3.1 70b model",
             huggingface_repo="meta-llama/Meta-Llama-3.1-70B",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=8,
-                memory_gb_per_gpu=20,
-            ),
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 8192,
@@ -242,11 +213,7 @@ def llama3_1_base_models() -> List[Model]:
             core_model_id=CoreModelId.meta_llama3_1_405b,
             is_default_variant=False,
             description_markdown="Llama 3.1 405b model (BF16 weights)",
-            huggingface_repo=None,
-            hardware_requirements=HardwareRequirements(
-                gpu_count=8,
-                memory_gb_per_gpu=120,
-            ),
+            huggingface_repo="meta-llama/Meta-Llama-3.1-405B",
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 16384,
@@ -266,10 +233,6 @@ def llama3_1_base_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama 3.1 405b model (FP8 quantized)",
             huggingface_repo="meta-llama/Meta-Llama-3.1-405B-FP8",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=8,
-                memory_gb_per_gpu=70,
-            ),
             quantization_format=CheckpointQuantizationFormat.fp8_mixed,
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
@@ -288,12 +251,8 @@ def llama3_1_base_models() -> List[Model]:
         Model(
             core_model_id=CoreModelId.meta_llama3_1_405b,
             is_default_variant=False,
-            description_markdown="Llama 3.1 405b model (BF16 weights)",
+            description_markdown="Llama 3.1 405b model (BF16 weights for mp16)",
             huggingface_repo="meta-llama/Meta-Llama-3.1-405B",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=16,
-                memory_gb_per_gpu=70,
-            ),
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 16384,
@@ -307,6 +266,9 @@ def llama3_1_base_models() -> List[Model]:
                 "rope_theta": 500000.0,
                 "use_scaled_rope": True,
             },
+            metadata={
+                "pth_file_count": 16,
+            },
         ),
     ]

@@ -318,10 +280,6 @@ def llama2_instruct_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama 2 7b chat model",
             huggingface_repo="meta-llama/Llama-2-7b-chat",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=1,
-                memory_gb_per_gpu=14,
-            ),
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 4096,
@@ -341,10 +299,6 @@ def llama2_instruct_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama 2 13b chat model",
             huggingface_repo="meta-llama/Llama-2-13b-chat",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=1,
-                memory_gb_per_gpu=28,
-            ),
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 5120,
@@ -364,10 +318,6 @@ def llama2_instruct_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama 2 70b chat model",
             huggingface_repo="meta-llama/Llama-2-70b-chat",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=3,
-                memory_gb_per_gpu=48,
-            ),
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 8192,
@@ -392,10 +342,6 @@ def llama3_instruct_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama 3 8b instruct model",
             huggingface_repo="meta-llama/Meta-Llama-3-8B-Instruct",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=1,
-                memory_gb_per_gpu=20,
-            ),
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 4096,
@@ -415,10 +361,6 @@ def llama3_instruct_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama 3 70b instruct model",
             huggingface_repo="meta-llama/Meta-Llama-3-70B-Instruct",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=3,
-                memory_gb_per_gpu=48,
-            ),
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 8192,
@@ -443,10 +385,6 @@ def llama3_1_instruct_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama 3.1 8b instruct model",
             huggingface_repo="meta-llama/Meta-Llama-3.1-8B-Instruct",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=1,
-                memory_gb_per_gpu=20,
-            ),
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 4096,
@@ -466,10 +404,6 @@ def llama3_1_instruct_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama 3.1 70b instruct model",
             huggingface_repo="meta-llama/Meta-Llama-3.1-70B-Instruct",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=8,
-                memory_gb_per_gpu=20,
-            ),
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 8192,
@@ -488,11 +422,7 @@ def llama3_1_instruct_models() -> List[Model]:
             core_model_id=CoreModelId.meta_llama3_1_405b_instruct,
             is_default_variant=False,
             description_markdown="Llama 3.1 405b instruct model (BF16 weights)",
-            huggingface_repo=None,
-            hardware_requirements=HardwareRequirements(
-                gpu_count=8,
-                memory_gb_per_gpu=120,
-            ),
+            huggingface_repo="meta-llama/Meta-Llama-3.1-405B-Instruct",
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 16384,
@@ -512,10 +442,6 @@ def llama3_1_instruct_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama 3.1 405b instruct model (FP8 quantized)",
             huggingface_repo="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=8,
-                memory_gb_per_gpu=70,
-            ),
             quantization_format=CheckpointQuantizationFormat.fp8_mixed,
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
@@ -534,12 +460,8 @@ def llama3_1_instruct_models() -> List[Model]:
         Model(
             core_model_id=CoreModelId.meta_llama3_1_405b_instruct,
             is_default_variant=False,
-            description_markdown="Llama 3.1 405b instruct model (BF16 weights)",
+            description_markdown="Llama 3.1 405b instruct model (BF16 weights for mp16)",
             huggingface_repo="meta-llama/Meta-Llama-3.1-405B-Instruct",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=16,
-                memory_gb_per_gpu=70,
-            ),
             recommended_sampling_params=recommended_sampling_params(),
             model_args={
                 "dim": 16384,
@@ -553,6 +475,9 @@ def llama3_1_instruct_models() -> List[Model]:
                 "rope_theta": 500000.0,
                 "use_scaled_rope": True,
             },
+            metadata={
+                "pth_file_count": 16,
+            },
         ),
     ]

@@ -565,10 +490,6 @@ def safety_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama Guard v3 8b system safety model",
             huggingface_repo="meta-llama/Llama-Guard-3-8B",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=1,
-                memory_gb_per_gpu=20,
-            ),
             model_args={
                 "dim": 4096,
                 "ffn_dim_multiplier": 1.3,
@@ -588,10 +509,6 @@ def safety_models() -> List[Model]:
             description_markdown="Llama Guard v3 8b system safety model",
             huggingface_repo="meta-llama/Llama-Guard-3-8B-INT8",
             quantization_format=CheckpointQuantizationFormat.int8,
-            hardware_requirements=HardwareRequirements(
-                gpu_count=1,
-                memory_gb_per_gpu=10,
-            ),
             model_args={
                 "dim": 4096,
                 "ffn_dim_multiplier": 1.3,
@@ -610,10 +527,6 @@ def safety_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Prompt Guard 86M injection safety model",
             huggingface_repo="meta-llama/Prompt-Guard-86M",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=1,
-                memory_gb_per_gpu=1,
-            ),
             model_args={},
         ),
         Model(
@@ -621,10 +534,6 @@ def safety_models() -> List[Model]:
             is_default_variant=True,
             description_markdown="Llama Guard v2 8b system safety model",
             huggingface_repo="meta-llama/Meta-Llama-Guard-2-8B",
-            hardware_requirements=HardwareRequirements(
-                gpu_count=1,
-                memory_gb_per_gpu=20,
-            ),
             model_args={
                 "dim": 4096,
                 "n_layers": 32,
@@ -654,16 +563,16 @@ class LlamaDownloadInfo:

 def llama_meta_net_info(model: Model) -> LlamaDownloadInfo:
     """Information needed to download model from llamameta.net"""
-    gpu = model.hardware_requirements.gpu_count
+    pth_count = model.metadata.get("pth_file_count", 16)
     if model.core_model_id == CoreModelId.meta_llama3_1_405b:
-        if gpu == 16:
+        if pth_count == 16:
             folder = "Meta-Llama-3.1-405B-MP16"
         elif model.quantization_format == CheckpointQuantizationFormat.fp8_mixed:
             folder = "Meta-Llama-3.1-405B"
         else:
             folder = "Meta-Llama-3.1-405B-MP8"
     elif model.core_model_id == CoreModelId.meta_llama3_1_405b_instruct:
-        if gpu == 16:
+        if pth_count == 16:
             folder = "Meta-Llama-3.1-405B-Instruct-MP16"
         elif model.quantization_format == CheckpointQuantizationFormat.fp8_mixed:
             folder = "Meta-Llama-3.1-405B-Instruct"
@@ -734,8 +643,8 @@ def llama_meta_pth_size(model: Model) -> int:
     ):
         return 0

-    gpu = model.hardware_requirements.gpu_count
-    if gpu == 16:
+    pth_count = model.metadata.get("pth_file_count", 0)
+    if pth_count == 16:
         return 51268302389
     elif model.quantization_format == CheckpointQuantizationFormat.fp8_mixed:
         return 60903742309
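For a sense of how the new field behaves end to end, here is a minimal usage sketch (mine, not part of the commit; the imports exist in this repo per the diff). Note the asymmetric defaults above: llama_meta_net_info assumes 16 .pth files when metadata is silent, while llama_meta_pth_size assumes 0.

    from models.sku_list import llama3_1_base_models, llama_meta_pth_size

    for model in llama3_1_base_models():
        # metadata defaults to {} via Field(default_factory=dict), so
        # .get() is safe on the SKUs that never set pth_file_count.
        pth_count = model.metadata.get("pth_file_count", 0)
        size = llama_meta_pth_size(model)  # bytes per .pth shard, 0 if untracked
        print(model.core_model_id, model.quantization_format.value, pth_count, size)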