author     Ashwin Bharambe <ashwin@meta.com>  2024-09-13 15:52:39 -0700
committer  Ashwin Bharambe <ashwin@meta.com>  2024-09-13 15:52:39 -0700
commit     d37cbd85c7b30c275add641b5ee5d7c17165b6ca (patch)
tree       8ce805b4ad95a4dd98ed3cf0af28c5095857e103
parent     659a13ae2099748d23088fbf9293521de569dd8c (diff)
Drop confusing hardware_requirements from SKU
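
The hardware_requirements struct on each SKU mixed two unrelated things:
a rough per-GPU memory hint that readers took as a hard requirement, and
a gpu_count that the download helpers actually used as the checkpoint
shard count. Drop the struct and move the one piece of information
consumers still need into a free-form metadata dict, set explicitly
(pth_file_count=16) on the MP16 405B SKUs.

A minimal sketch of the consumer side after this change (the helper name
is illustrative, not part of this diff; the default of 0 mirrors
llama_meta_pth_size below):

    # Hypothetical helper: shard count for a SKU. SKUs without explicit
    # metadata fall back to 0 and are routed by quantization_format
    # instead.
    def shard_count(model: Model) -> int:
        return model.metadata.get("pth_file_count", 0)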
-rw-r--r--  models/datatypes.py  |  11
-rw-r--r--  models/sku_list.py   | 121
2 files changed, 17 insertions(+), 115 deletions(-)
diff --git a/models/datatypes.py b/models/datatypes.py
index a41fc11..3fd6ab7 100644
--- a/models/datatypes.py
+++ b/models/datatypes.py
@@ -8,7 +8,7 @@
from enum import Enum
from typing import Any, Dict, Optional
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field
from .schema_utils import json_schema_type
@@ -126,12 +126,6 @@ def model_family(model_id) -> ModelFamily:
raise ValueError(f"Unknown model family for {model_id}")
-@json_schema_type
-class HardwareRequirements(BaseModel):
- memory_gb_per_gpu: int
- gpu_count: int
-
-
@json_schema_type(
schema={
"description": "The model family and SKU of the model along with other parameters corresponding to the model."
@@ -180,7 +174,6 @@ class Model(BaseModel):
def variant(self) -> str:
parts = [
self.quantization_format.value,
- f"mp{self.hardware_requirements.gpu_count}",
]
return "-".join(parts)
@@ -194,12 +187,12 @@ class Model(BaseModel):
description_markdown: str
huggingface_repo: Optional[str] = None
- hardware_requirements: HardwareRequirements
quantization_format: CheckpointQuantizationFormat = (
CheckpointQuantizationFormat.bf16
)
recommended_sampling_params: Optional[SamplingParams] = None
model_args: Dict[str, Any]
+ metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
@property
def is_instruct_model(self) -> bool:
diff --git a/models/sku_list.py b/models/sku_list.py
index 801b965..b992bd8 100644
--- a/models/sku_list.py
+++ b/models/sku_list.py
@@ -11,7 +11,6 @@ from typing import List, Optional
from .datatypes import (
CheckpointQuantizationFormat,
CoreModelId,
- HardwareRequirements,
Model,
SamplingParams,
SamplingStrategy,
@@ -72,10 +71,6 @@ def llama2_base_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama 2 7b model",
huggingface_repo="meta-llama/Llama-2-7b",
- hardware_requirements=HardwareRequirements(
- gpu_count=1,
- memory_gb_per_gpu=20,
- ),
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 4096,
@@ -95,10 +90,6 @@ def llama2_base_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama 2 13b model",
huggingface_repo="meta-llama/Llama-2-13b",
- hardware_requirements=HardwareRequirements(
- gpu_count=1,
- memory_gb_per_gpu=28,
- ),
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 5120,
@@ -118,10 +109,6 @@ def llama2_base_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama 2 70b model",
huggingface_repo="meta-llama/Llama-2-70b",
- hardware_requirements=HardwareRequirements(
- gpu_count=3,
- memory_gb_per_gpu=48,
- ),
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 8192,
@@ -146,10 +133,6 @@ def llama3_base_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama 3 8b model",
huggingface_repo="meta-llama/Meta-Llama-3-8B",
- hardware_requirements=HardwareRequirements(
- gpu_count=1,
- memory_gb_per_gpu=20,
- ),
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 4096,
@@ -169,10 +152,6 @@ def llama3_base_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama 3 70b model",
huggingface_repo="meta-llama/Meta-Llama-3-70B",
- hardware_requirements=HardwareRequirements(
- gpu_count=8,
- memory_gb_per_gpu=20,
- ),
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 8192,
@@ -197,10 +176,6 @@ def llama3_1_base_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama 3.1 8b model",
huggingface_repo="meta-llama/Meta-Llama-3.1-8B",
- hardware_requirements=HardwareRequirements(
- gpu_count=1,
- memory_gb_per_gpu=20,
- ),
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 4096,
@@ -220,10 +195,6 @@ def llama3_1_base_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama 3.1 70b model",
huggingface_repo="meta-llama/Meta-Llama-3.1-70B",
- hardware_requirements=HardwareRequirements(
- gpu_count=8,
- memory_gb_per_gpu=20,
- ),
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 8192,
@@ -242,11 +213,7 @@ def llama3_1_base_models() -> List[Model]:
core_model_id=CoreModelId.meta_llama3_1_405b,
is_default_variant=False,
description_markdown="Llama 3.1 405b model (BF16 weights)",
- huggingface_repo=None,
- hardware_requirements=HardwareRequirements(
- gpu_count=8,
- memory_gb_per_gpu=120,
- ),
+ huggingface_repo="meta-llama/Meta-Llama-3.1-405B",
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 16384,
@@ -266,10 +233,6 @@ def llama3_1_base_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama 3.1 405b model (FP8 quantized)",
huggingface_repo="meta-llama/Meta-Llama-3.1-405B-FP8",
- hardware_requirements=HardwareRequirements(
- gpu_count=8,
- memory_gb_per_gpu=70,
- ),
quantization_format=CheckpointQuantizationFormat.fp8_mixed,
recommended_sampling_params=recommended_sampling_params(),
model_args={
@@ -288,12 +251,8 @@ def llama3_1_base_models() -> List[Model]:
Model(
core_model_id=CoreModelId.meta_llama3_1_405b,
is_default_variant=False,
- description_markdown="Llama 3.1 405b model (BF16 weights)",
+ description_markdown="Llama 3.1 405b model (BF16 weights for mp16)",
huggingface_repo="meta-llama/Meta-Llama-3.1-405B",
- hardware_requirements=HardwareRequirements(
- gpu_count=16,
- memory_gb_per_gpu=70,
- ),
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 16384,
@@ -307,6 +266,9 @@ def llama3_1_base_models() -> List[Model]:
"rope_theta": 500000.0,
"use_scaled_rope": True,
},
+ metadata={
+ "pth_file_count": 16,
+ },
),
]
@@ -318,10 +280,6 @@ def llama2_instruct_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama 2 7b chat model",
huggingface_repo="meta-llama/Llama-2-7b-chat",
- hardware_requirements=HardwareRequirements(
- gpu_count=1,
- memory_gb_per_gpu=14,
- ),
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 4096,
@@ -341,10 +299,6 @@ def llama2_instruct_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama 2 13b chat model",
huggingface_repo="meta-llama/Llama-2-13b-chat",
- hardware_requirements=HardwareRequirements(
- gpu_count=1,
- memory_gb_per_gpu=28,
- ),
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 5120,
@@ -364,10 +318,6 @@ def llama2_instruct_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama 2 70b chat model",
huggingface_repo="meta-llama/Llama-2-70b-chat",
- hardware_requirements=HardwareRequirements(
- gpu_count=3,
- memory_gb_per_gpu=48,
- ),
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 8192,
@@ -392,10 +342,6 @@ def llama3_instruct_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama 3 8b instruct model",
huggingface_repo="meta-llama/Meta-Llama-3-8B-Instruct",
- hardware_requirements=HardwareRequirements(
- gpu_count=1,
- memory_gb_per_gpu=20,
- ),
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 4096,
@@ -415,10 +361,6 @@ def llama3_instruct_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama 3 70b instruct model",
huggingface_repo="meta-llama/Meta-Llama-3-70B-Instruct",
- hardware_requirements=HardwareRequirements(
- gpu_count=3,
- memory_gb_per_gpu=48,
- ),
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 8192,
@@ -443,10 +385,6 @@ def llama3_1_instruct_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama 3.1 8b instruct model",
huggingface_repo="meta-llama/Meta-Llama-3.1-8B-Instruct",
- hardware_requirements=HardwareRequirements(
- gpu_count=1,
- memory_gb_per_gpu=20,
- ),
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 4096,
@@ -466,10 +404,6 @@ def llama3_1_instruct_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama 3.1 70b instruct model",
huggingface_repo="meta-llama/Meta-Llama-3.1-70B-Instruct",
- hardware_requirements=HardwareRequirements(
- gpu_count=8,
- memory_gb_per_gpu=20,
- ),
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 8192,
@@ -488,11 +422,7 @@ def llama3_1_instruct_models() -> List[Model]:
core_model_id=CoreModelId.meta_llama3_1_405b_instruct,
is_default_variant=False,
description_markdown="Llama 3.1 405b instruct model (BF16 weights)",
- huggingface_repo=None,
- hardware_requirements=HardwareRequirements(
- gpu_count=8,
- memory_gb_per_gpu=120,
- ),
+ huggingface_repo="meta-llama/Meta-Llama-3.1-405B-Instruct",
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 16384,
@@ -512,10 +442,6 @@ def llama3_1_instruct_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama 3.1 405b instruct model (FP8 quantized)",
huggingface_repo="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
- hardware_requirements=HardwareRequirements(
- gpu_count=8,
- memory_gb_per_gpu=70,
- ),
quantization_format=CheckpointQuantizationFormat.fp8_mixed,
recommended_sampling_params=recommended_sampling_params(),
model_args={
@@ -534,12 +460,8 @@ def llama3_1_instruct_models() -> List[Model]:
Model(
core_model_id=CoreModelId.meta_llama3_1_405b_instruct,
is_default_variant=False,
- description_markdown="Llama 3.1 405b instruct model (BF16 weights)",
+ description_markdown="Llama 3.1 405b instruct model (BF16 weights for mp16)",
huggingface_repo="meta-llama/Meta-Llama-3.1-405B-Instruct",
- hardware_requirements=HardwareRequirements(
- gpu_count=16,
- memory_gb_per_gpu=70,
- ),
recommended_sampling_params=recommended_sampling_params(),
model_args={
"dim": 16384,
@@ -553,6 +475,9 @@ def llama3_1_instruct_models() -> List[Model]:
"rope_theta": 500000.0,
"use_scaled_rope": True,
},
+ metadata={
+ "pth_file_count": 16,
+ },
),
]
@@ -565,10 +490,6 @@ def safety_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama Guard v3 8b system safety model",
huggingface_repo="meta-llama/Llama-Guard-3-8B",
- hardware_requirements=HardwareRequirements(
- gpu_count=1,
- memory_gb_per_gpu=20,
- ),
model_args={
"dim": 4096,
"ffn_dim_multiplier": 1.3,
@@ -588,10 +509,6 @@ def safety_models() -> List[Model]:
description_markdown="Llama Guard v3 8b system safety model",
huggingface_repo="meta-llama/Llama-Guard-3-8B-INT8",
quantization_format=CheckpointQuantizationFormat.int8,
- hardware_requirements=HardwareRequirements(
- gpu_count=1,
- memory_gb_per_gpu=10,
- ),
model_args={
"dim": 4096,
"ffn_dim_multiplier": 1.3,
@@ -610,10 +527,6 @@ def safety_models() -> List[Model]:
is_default_variant=True,
description_markdown="Prompt Guard 86M injection safety model",
huggingface_repo="meta-llama/Prompt-Guard-86M",
- hardware_requirements=HardwareRequirements(
- gpu_count=1,
- memory_gb_per_gpu=1,
- ),
model_args={},
),
Model(
@@ -621,10 +534,6 @@ def safety_models() -> List[Model]:
is_default_variant=True,
description_markdown="Llama Guard v2 8b system safety model",
huggingface_repo="meta-llama/Meta-Llama-Guard-2-8B",
- hardware_requirements=HardwareRequirements(
- gpu_count=1,
- memory_gb_per_gpu=20,
- ),
model_args={
"dim": 4096,
"n_layers": 32,
@@ -654,16 +563,16 @@ class LlamaDownloadInfo:
def llama_meta_net_info(model: Model) -> LlamaDownloadInfo:
"""Information needed to download model from llamameta.net"""
- gpu = model.hardware_requirements.gpu_count
+ pth_count = model.metadata.get("pth_file_count", 0)
if model.core_model_id == CoreModelId.meta_llama3_1_405b:
- if gpu == 16:
+ if pth_count == 16:
folder = "Meta-Llama-3.1-405B-MP16"
elif model.quantization_format == CheckpointQuantizationFormat.fp8_mixed:
folder = "Meta-Llama-3.1-405B"
else:
folder = "Meta-Llama-3.1-405B-MP8"
elif model.core_model_id == CoreModelId.meta_llama3_1_405b_instruct:
- if gpu == 16:
+ if pth_count == 16:
folder = "Meta-Llama-3.1-405B-Instruct-MP16"
elif model.quantization_format == CheckpointQuantizationFormat.fp8_mixed:
folder = "Meta-Llama-3.1-405B-Instruct"
@@ -734,8 +643,8 @@ def llama_meta_pth_size(model: Model) -> int:
):
return 0
- gpu = model.hardware_requirements.gpu_count
- if gpu == 16:
+ pth_count = model.metadata.get("pth_file_count", 0)
+ if pth_count == 16:
return 51268302389
elif model.quantization_format == CheckpointQuantizationFormat.fp8_mixed:
return 60903742309
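
Taken together, the two hunks above make folder routing for the 405B
SKUs depend only on pth_file_count and quantization_format. A small
sanity-check sketch (assumes these names are importable from
models.sku_list as defined in this file, and that LlamaDownloadInfo
exposes the folder it was constructed with; both are assumptions, not
shown in the diff):

    # Sketch: exercise the new routing for the 405B instruct SKUs.
    # Expected folders per the diff: ...-Instruct-MP16 for the mp16 SKU,
    # ...-Instruct for fp8, ...-Instruct-MP8 for plain bf16.
    from models.sku_list import (
        CoreModelId,
        llama3_1_instruct_models,
        llama_meta_net_info,
    )

    for m in llama3_1_instruct_models():
        if m.core_model_id == CoreModelId.meta_llama3_1_405b_instruct:
            # .folder is assumed to be the field llama_meta_net_info
            # populates with the folder computed above.
            print(m.quantization_format.value, llama_meta_net_info(m).folder)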