[
  {
    "hf_id": "ByteDance-Seed/Seed-OSS-36B-Instruct",
    "hf_org": "ByteDance-Seed",
    "hf_repo": "Seed-OSS-36B-Instruct",
    "title": "Seed-OSS-36B",
    "provider": "Seed (ByteDance)",
    "description": "ByteDance Seed-OSS 36B dense model with unique 'thinking budget' control and 512K context support",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.0",
    "architecture": "dense",
    "parameter_count": "36B",
    "active_parameters": "36B",
    "context_length": 524288,
    "tasks": [
      "text"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 86
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [
      "tool_calling"
    ],
    "opt_in_features": [],
    "url": "/ByteDance-Seed/Seed-OSS-36B-Instruct",
    "json": "/ByteDance-Seed/Seed-OSS-36B-Instruct.json"
  },
  {
    "hf_id": "Google/gemma-4-31B-it",
    "hf_org": "Google",
    "hf_repo": "gemma-4-31B-it",
    "title": "Gemma 4 31B IT",
    "provider": "Google",
    "description": "Google's unified multimodal Gemma 4 dense model (31B) with native text, image, and audio, plus thinking mode and tool-use protocol.",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.12.0",
    "architecture": "dense",
    "parameter_count": "31B",
    "active_parameters": "31B",
    "context_length": 262144,
    "tasks": [
      "multimodal",
      "text"
    ],
    "performance_headline": "Unified multimodal model with structured thinking, function calling, dynamic vision resolution",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 210
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 19
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [
      "tool_calling",
      "reasoning"
    ],
    "opt_in_features": [],
    "url": "/Google/gemma-4-31B-it",
    "json": "/Google/gemma-4-31B-it.json"
  },
  {
    "hf_id": "Google/translategemma-27b-it",
    "hf_org": "Google",
    "hf_repo": "translategemma-27b-it",
    "title": "TranslateGemma 27B IT",
    "provider": "Google",
    "description": "Lightweight open translation model from Google (based on Gemma 3) supporting 55 languages. Served via the vLLM-optimized Infomaniak-AI checkpoint.",
    "date_updated": "2026-04-17",
    "difficulty": "beginner",
    "min_vllm_version": "0.14.1",
    "architecture": "dense",
    "parameter_count": "27B",
    "active_parameters": "27B",
    "context_length": 131072,
    "tasks": [
      "text"
    ],
    "performance_headline": "Deployable on laptops/desktops and cloud GPUs; vLLM-optimized checkpoint removes custom JSON inputs",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 65
      },
      "vllm_optimized": {
        "precision": "bf16",
        "vram_minimum_gb": 65
      },
      "small_4b": {
        "precision": "bf16",
        "vram_minimum_gb": 10
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/Google/translategemma-27b-it",
    "json": "/Google/translategemma-27b-it.json"
  },
  {
    "hf_id": "MiniMaxAI/MiniMax-M2.1",
    "hf_org": "MiniMaxAI",
    "hf_repo": "MiniMax-M2.1",
    "title": "MiniMax-M2.1",
    "provider": "MiniMax",
    "description": "MiniMax M2.1 MoE language model (230B total / 10B active) for coding, agent toolchains, and long-context reasoning — native FP8 checkpoint",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.0",
    "architecture": "moe",
    "parameter_count": "230B",
    "active_parameters": "10B",
    "context_length": 196608,
    "tasks": [
      "text"
    ],
    "performance_headline": "Updated M2 series MoE with strong SWE-Bench and Terminal-Bench performance, 196K context",
    "variants": {
      "default": {
        "precision": "fp8",
        "vram_minimum_gb": 276
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [
      "tool_calling",
      "reasoning"
    ],
    "opt_in_features": [],
    "url": "/MiniMaxAI/MiniMax-M2.1",
    "json": "/MiniMaxAI/MiniMax-M2.1.json"
  },
  {
    "hf_id": "MiniMaxAI/MiniMax-M2.5",
    "hf_org": "MiniMaxAI",
    "hf_repo": "MiniMax-M2.5",
    "title": "MiniMax-M2.5",
    "provider": "MiniMax",
    "description": "MiniMax M2.5 MoE language model (230B total / 10B active) for coding, agent toolchains, and long-context reasoning — native FP8 checkpoint",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.19.0",
    "architecture": "moe",
    "parameter_count": "230B",
    "active_parameters": "10B",
    "context_length": 196608,
    "tasks": [
      "text"
    ],
    "performance_headline": "Refreshed M2 series MoE with strong SWE-Bench and Terminal-Bench performance, 196K context",
    "variants": {
      "default": {
        "precision": "fp8",
        "vram_minimum_gb": 276
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 138
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [
      "tool_calling",
      "reasoning"
    ],
    "opt_in_features": [],
    "url": "/MiniMaxAI/MiniMax-M2.5",
    "json": "/MiniMaxAI/MiniMax-M2.5.json"
  },
  {
    "hf_id": "MiniMaxAI/MiniMax-M2.7",
    "hf_org": "MiniMaxAI",
    "hf_repo": "MiniMax-M2.7",
    "title": "MiniMax-M2.7",
    "provider": "MiniMax",
    "description": "MiniMax M2.7 MoE language model (230B total / 10B active) — latest M2 release for coding, agent toolchains, and long-context reasoning with native FP8",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.20.0",
    "architecture": "moe",
    "parameter_count": "230B",
    "active_parameters": "10B",
    "context_length": 196608,
    "tasks": [
      "text"
    ],
    "performance_headline": "Latest M2 series release; verified accuracy on AIME25, GPQA-D, GSM8K; 196K context",
    "variants": {
      "default": {
        "precision": "fp8",
        "vram_minimum_gb": 276
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [
      "tool_calling",
      "reasoning"
    ],
    "opt_in_features": [],
    "url": "/MiniMaxAI/MiniMax-M2.7",
    "json": "/MiniMaxAI/MiniMax-M2.7.json"
  },
  {
    "hf_id": "MiniMaxAI/MiniMax-M2",
    "hf_org": "MiniMaxAI",
    "hf_repo": "MiniMax-M2",
    "title": "MiniMax-M2",
    "provider": "MiniMax",
    "description": "MiniMax M2 MoE language model (230B total / 10B active) for coding, agent toolchains, and long-context reasoning — native FP8 checkpoint",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.0",
    "architecture": "moe",
    "parameter_count": "230B",
    "active_parameters": "10B",
    "context_length": 196608,
    "tasks": [
      "text"
    ],
    "performance_headline": "Open-source MoE with strong SWE-Bench and Terminal-Bench performance, 196K context",
    "variants": {
      "default": {
        "precision": "fp8",
        "vram_minimum_gb": 276
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [
      "tool_calling",
      "reasoning"
    ],
    "opt_in_features": [],
    "url": "/MiniMaxAI/MiniMax-M2",
    "json": "/MiniMaxAI/MiniMax-M2.json"
  },
  {
    "hf_id": "OpenGVLab/InternVL3_5-8B",
    "hf_org": "OpenGVLab",
    "hf_repo": "InternVL3_5-8B",
    "title": "InternVL3.5",
    "provider": "InternVL (OpenGVLab)",
    "description": "InternVL 3.5 vision-language models from Shanghai AI Lab with thinking-mode prompting",
    "date_updated": "2026-04-17",
    "difficulty": "beginner",
    "min_vllm_version": "0.10.0",
    "architecture": "dense",
    "parameter_count": "8B",
    "active_parameters": "8B",
    "context_length": 40960,
    "tasks": [
      "multimodal"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 19
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/OpenGVLab/InternVL3_5-8B",
    "json": "/OpenGVLab/InternVL3_5-8B.json"
  },
  {
    "hf_id": "PaddlePaddle/PaddleOCR-VL",
    "hf_org": "PaddlePaddle",
    "hf_repo": "PaddleOCR-VL",
    "title": "PaddleOCR-VL",
    "provider": "PaddlePaddle",
    "description": "PaddleOCR-VL (0.9B) — compact vision-language model for document parsing, OCR, tables, formulas, charts",
    "date_updated": "2026-04-17",
    "difficulty": "beginner",
    "min_vllm_version": "0.11.1",
    "architecture": "dense",
    "parameter_count": "0.9B",
    "active_parameters": "0.9B",
    "context_length": 131072,
    "tasks": [
      "multimodal"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 2
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/PaddlePaddle/PaddleOCR-VL",
    "json": "/PaddlePaddle/PaddleOCR-VL.json"
  },
  {
    "hf_id": "Qwen/Qwen-Image",
    "hf_org": "Qwen",
    "hf_repo": "Qwen-Image",
    "title": "Qwen-Image",
    "provider": "Qwen",
    "description": "Text-to-image diffusion model (20B parameters) from the Qwen-Image family, served via vLLM-Omni.",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.18.0",
    "architecture": "dense",
    "parameter_count": "20B",
    "active_parameters": "20B",
    "context_length": 0,
    "tasks": [
      "omni"
    ],
    "performance_headline": "Shared DiT core across T2I, image editing, and layered-image variants; accelerated via Cache-DiT, TeaCache, and sequence parallelism",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 48
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 24
      },
      "int8": {
        "precision": "int8",
        "vram_minimum_gb": 24
      }
    },
    "compatible_strategies": [
      "single_node_tp"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/Qwen/Qwen-Image",
    "json": "/Qwen/Qwen-Image.json"
  },
  {
    "hf_id": "Qwen/Qwen2.5-VL-72B-Instruct",
    "hf_org": "Qwen",
    "hf_repo": "Qwen2.5-VL-72B-Instruct",
    "title": "Qwen2.5-VL-72B-Instruct",
    "provider": "Qwen",
    "description": "Qwen2.5-VL dense vision-language model (72B) for high-quality image and video understanding.",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.7.0",
    "architecture": "dense",
    "parameter_count": "72B",
    "active_parameters": "72B",
    "context_length": 128000,
    "tasks": [
      "multimodal",
      "text"
    ],
    "performance_headline": "Verified on 4x A100 and 4x MI300X/MI325X/MI355X with BF16",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 173
      },
      "awq": {
        "precision": "int4",
        "vram_minimum_gb": 43
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/Qwen/Qwen2.5-VL-72B-Instruct",
    "json": "/Qwen/Qwen2.5-VL-72B-Instruct.json"
  },
  {
    "hf_id": "Qwen/Qwen3-235B-A22B-Instruct-2507",
    "hf_org": "Qwen",
    "hf_repo": "Qwen3-235B-A22B-Instruct-2507",
    "title": "Qwen3-235B-A22B-Instruct",
    "provider": "Qwen",
    "description": "Flagship Qwen3 MoE instruct model with 235B total and 22B active parameters, tuned for high-quality text generation.",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.10.0",
    "architecture": "moe",
    "parameter_count": "235B",
    "active_parameters": "22B",
    "context_length": 262144,
    "tasks": [
      "text"
    ],
    "performance_headline": "Verified on 4x/8x H200, MI300X/MI325X/MI355X nodes (BF16 and FP8)",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 564
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 240
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 141
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep",
      "pd_cluster"
    ],
    "features": [
      "tool_calling"
    ],
    "opt_in_features": [],
    "url": "/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "json": "/Qwen/Qwen3-235B-A22B-Instruct-2507.json"
  },
  {
    "hf_id": "Qwen/Qwen3-ASR-1.7B",
    "hf_org": "Qwen",
    "hf_repo": "Qwen3-ASR-1.7B",
    "title": "Qwen3-ASR-1.7B",
    "provider": "Qwen",
    "description": "Speech-to-text model supporting 11 languages, multiple accents, and singing voice with customizable text-context prompting.",
    "date_updated": "2026-04-17",
    "difficulty": "beginner",
    "min_vllm_version": "0.12.0",
    "architecture": "dense",
    "parameter_count": "2.3B",
    "active_parameters": "2.3B",
    "context_length": 65536,
    "tasks": [
      "multimodal"
    ],
    "performance_headline": "Accurate multilingual ASR, including singing voice; single-GPU serving",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 4
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/Qwen/Qwen3-ASR-1.7B",
    "json": "/Qwen/Qwen3-ASR-1.7B.json"
  },
  {
    "hf_id": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
    "hf_org": "Qwen",
    "hf_repo": "Qwen3-Coder-480B-A35B-Instruct",
    "title": "Qwen3-Coder-480B-A35B-Instruct",
    "provider": "Qwen",
    "description": "Large coder MoE with 480B total / 35B active parameters, strong tool-use and code generation capabilities.",
    "date_updated": "2026-04-17",
    "difficulty": "advanced",
    "min_vllm_version": "0.10.0",
    "architecture": "moe",
    "parameter_count": "480B",
    "active_parameters": "35B",
    "context_length": 262144,
    "tasks": [
      "text"
    ],
    "performance_headline": "HumanEval 0.939, MBPP 0.918 (FP8). Recommended FP8 on 8x H200/H20 via DP=8",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 1152
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 576
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 288
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep",
      "pd_cluster"
    ],
    "features": [
      "tool_calling"
    ],
    "opt_in_features": [],
    "url": "/Qwen/Qwen3-Coder-480B-A35B-Instruct",
    "json": "/Qwen/Qwen3-Coder-480B-A35B-Instruct.json"
  },
  {
    "hf_id": "Qwen/Qwen3-Next-80B-A3B-Instruct",
    "hf_org": "Qwen",
    "hf_repo": "Qwen3-Next-80B-A3B-Instruct",
    "title": "Qwen3-Next-80B-A3B-Instruct",
    "provider": "Qwen",
    "description": "Advanced Qwen3-Next MoE model (80B total / 3B active) with hybrid attention, highly sparse experts, and multi-token prediction.",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.10.0",
    "architecture": "moe",
    "parameter_count": "80B",
    "active_parameters": "3B",
    "context_length": 262144,
    "tasks": [
      "text"
    ],
    "performance_headline": "Highly sparse MoE with MTP-accelerated decoding, runs on 4x H200/H20/A100/A800",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 192
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 96
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 48
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep"
    ],
    "features": [
      "tool_calling",
      "spec_decoding"
    ],
    "opt_in_features": [
      "spec_decoding"
    ],
    "url": "/Qwen/Qwen3-Next-80B-A3B-Instruct",
    "json": "/Qwen/Qwen3-Next-80B-A3B-Instruct.json"
  },
  {
    "hf_id": "Qwen/Qwen3-VL-235B-A22B-Instruct",
    "hf_org": "Qwen",
    "hf_repo": "Qwen3-VL-235B-A22B-Instruct",
    "title": "Qwen3-VL-235B-A22B-Instruct",
    "provider": "Qwen",
    "description": "Qwen3-VL flagship MoE vision-language model with 235B total / 22B active parameters, supporting images, video, and long context.",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.0",
    "architecture": "moe",
    "parameter_count": "235B",
    "active_parameters": "22B",
    "context_length": 262144,
    "tasks": [
      "multimodal",
      "text"
    ],
    "performance_headline": "Strong on images, video, and text — #1 open model on text on lmarena.ai at release",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 564
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 282
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 141
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep",
      "pd_cluster"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/Qwen/Qwen3-VL-235B-A22B-Instruct",
    "json": "/Qwen/Qwen3-VL-235B-A22B-Instruct.json"
  },
  {
    "hf_id": "Qwen/Qwen3.5-397B-A17B",
    "hf_org": "Qwen",
    "hf_repo": "Qwen3.5-397B-A17B",
    "title": "Qwen3.5-397B",
    "provider": "Qwen",
    "description": "Multimodal MoE model with gated delta networks architecture, 397B total / 17B active parameters, up to 262K context",
    "date_updated": "2026-04-16",
    "difficulty": "intermediate",
    "min_vllm_version": "0.17.0",
    "architecture": "moe",
    "parameter_count": "397B",
    "active_parameters": "17B",
    "context_length": 262144,
    "tasks": [
      "text"
    ],
    "performance_headline": "Verified on 8x H200, 8x MI300X/MI355X, and GB200 nodes",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 953
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 238
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep",
      "pd_cluster"
    ],
    "features": [
      "tool_calling",
      "reasoning",
      "spec_decoding"
    ],
    "opt_in_features": [
      "spec_decoding"
    ],
    "url": "/Qwen/Qwen3.5-397B-A17B",
    "json": "/Qwen/Qwen3.5-397B-A17B.json"
  },
  {
    "hf_id": "Qwen/Qwen3.6-35B-A3B",
    "hf_org": "Qwen",
    "hf_repo": "Qwen3.6-35B-A3B",
    "title": "Qwen3.6-35B-A3B",
    "provider": "Qwen",
    "description": "Smaller Qwen3.6 multimodal MoE model (35B total / 3B active) with 256 experts (8 routed + 1 shared), gated delta networks architecture, and 262K context",
    "date_updated": "2026-04-18",
    "difficulty": "beginner",
    "min_vllm_version": "0.17.0",
    "architecture": "moe",
    "parameter_count": "35B",
    "active_parameters": "3B",
    "context_length": 262144,
    "tasks": [
      "text"
    ],
    "performance_headline": "Compact Qwen3.6 MoE with 3B active parameters — single-GPU FP8 or 2-4 GPU BF16 serving",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 84
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 42
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep"
    ],
    "features": [
      "tool_calling",
      "reasoning",
      "spec_decoding"
    ],
    "opt_in_features": [
      "spec_decoding"
    ],
    "url": "/Qwen/Qwen3.6-35B-A3B",
    "json": "/Qwen/Qwen3.6-35B-A3B.json"
  },
  {
    "hf_id": "Qwen/Qwen3Guard-Gen-8B",
    "hf_org": "Qwen",
    "hf_repo": "Qwen3Guard-Gen-8B",
    "title": "Qwen3Guard-Gen-8B",
    "provider": "Qwen",
    "description": "Lightweight text-only guardrail/safety classifier model in the Qwen3Guard family.",
    "date_updated": "2026-04-17",
    "difficulty": "beginner",
    "min_vllm_version": "0.10.0",
    "architecture": "dense",
    "parameter_count": "8B",
    "active_parameters": "8B",
    "context_length": 32768,
    "tasks": [
      "text"
    ],
    "performance_headline": "Runs on a single GPU; serves safety classifications over OpenAI-compatible API",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 19
      },
      "small_4b": {
        "precision": "bf16",
        "vram_minimum_gb": 10
      },
      "tiny_0_6b": {
        "precision": "bf16",
        "vram_minimum_gb": 4
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/Qwen/Qwen3Guard-Gen-8B",
    "json": "/Qwen/Qwen3Guard-Gen-8B.json"
  },
  {
    "hf_id": "Wan-AI/Wan2.2-T2V-A14B-Diffusers",
    "hf_org": "Wan-AI",
    "hf_repo": "Wan2.2-T2V-A14B-Diffusers",
    "title": "Wan2.2",
    "provider": "Wan AI",
    "description": "Wan2.2 video generation models — T2V/I2V MoE (14B active) and unified TI2V (5B dense), served via vLLM-Omni",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.12.0",
    "architecture": "moe",
    "parameter_count": "28B",
    "active_parameters": "14B",
    "context_length": 0,
    "tasks": [
      "omni"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 152
      },
      "i2v": {
        "precision": "bf16",
        "vram_minimum_gb": 40
      },
      "ti2v_5b": {
        "precision": "bf16",
        "vram_minimum_gb": 20
      }
    },
    "compatible_strategies": [],
    "features": [],
    "opt_in_features": [],
    "url": "/Wan-AI/Wan2.2-T2V-A14B-Diffusers",
    "json": "/Wan-AI/Wan2.2-T2V-A14B-Diffusers.json"
  },
  {
    "hf_id": "XiaomiMiMo/MiMo-V2-Flash",
    "hf_org": "XiaomiMiMo",
    "hf_repo": "MiMo-V2-Flash",
    "title": "MiMo-V2-Flash",
    "provider": "Xiaomi MiMo",
    "description": "Xiaomi's MoE reasoning model (309B total / 15B active) with hybrid attention and MTP for fast agentic workflows",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.0",
    "architecture": "moe",
    "parameter_count": "309B",
    "active_parameters": "15B",
    "context_length": 262144,
    "tasks": [
      "text"
    ],
    "variants": {
      "default": {
        "precision": "fp8",
        "vram_minimum_gb": 371
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [
      "tool_calling",
      "reasoning"
    ],
    "opt_in_features": [],
    "url": "/XiaomiMiMo/MiMo-V2-Flash",
    "json": "/XiaomiMiMo/MiMo-V2-Flash.json"
  },
  {
    "hf_id": "arcee-ai/Trinity-Large-Thinking",
    "hf_org": "arcee-ai",
    "hf_repo": "Trinity-Large-Thinking",
    "title": "Trinity-Large-Thinking",
    "provider": "Arcee AI",
    "description": "Arcee AI's reasoning-focused sparse MoE (AfmoeForCausalLM) with structured <think> traces and agentic tool use",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.1",
    "architecture": "moe",
    "parameter_count": "398B",
    "active_parameters": "13B",
    "context_length": 262144,
    "tasks": [
      "text"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 955
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 239
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [
      "tool_calling",
      "reasoning"
    ],
    "opt_in_features": [],
    "url": "/arcee-ai/Trinity-Large-Thinking",
    "json": "/arcee-ai/Trinity-Large-Thinking.json"
  },
  {
    "hf_id": "baidu/ERNIE-4.5-21B-A3B-PT",
    "hf_org": "baidu",
    "hf_repo": "ERNIE-4.5-21B-A3B-PT",
    "title": "ERNIE-4.5",
    "provider": "Ernie (Baidu)",
    "description": "Baidu ERNIE 4.5 MoE text models (21B-A3B, 300B-A47B) with BF16 and FP8 support plus ERNIE-MTP speculative decoding",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.10.1",
    "architecture": "moe",
    "parameter_count": "21B",
    "active_parameters": "3B",
    "context_length": 131072,
    "tasks": [
      "text"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 106
      },
      "300b": {
        "precision": "bf16",
        "vram_minimum_gb": 640
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [
      "spec_decoding"
    ],
    "opt_in_features": [
      "spec_decoding"
    ],
    "url": "/baidu/ERNIE-4.5-21B-A3B-PT",
    "json": "/baidu/ERNIE-4.5-21B-A3B-PT.json"
  },
  {
    "hf_id": "baidu/ERNIE-4.5-VL-28B-A3B-PT",
    "hf_org": "baidu",
    "hf_repo": "ERNIE-4.5-VL-28B-A3B-PT",
    "title": "ERNIE-4.5-VL",
    "provider": "Ernie (Baidu)",
    "description": "Baidu ERNIE 4.5 VL MoE vision-language models (28B-A3B, 424B-A47B) with heterogeneous text/vision experts",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.0",
    "architecture": "moe",
    "parameter_count": "28B",
    "active_parameters": "3B",
    "context_length": 131072,
    "tasks": [
      "multimodal"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 67
      },
      "424b": {
        "precision": "bf16",
        "vram_minimum_gb": 1120
      },
      "424b_fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 640
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/baidu/ERNIE-4.5-VL-28B-A3B-PT",
    "json": "/baidu/ERNIE-4.5-VL-28B-A3B-PT.json"
  },
  {
    "hf_id": "deepseek-ai/DeepSeek-OCR-2",
    "hf_org": "deepseek-ai",
    "hf_repo": "DeepSeek-OCR-2",
    "title": "DeepSeek-OCR-2",
    "provider": "DeepSeek",
    "description": "Next-generation DeepSeek OCR model with improved document-to-markdown grounding and optical context compression.",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.12.0",
    "architecture": "moe",
    "parameter_count": "3B",
    "active_parameters": "3B",
    "context_length": 8192,
    "tasks": [
      "multimodal"
    ],
    "performance_headline": "Improved grounding and markdown conversion over DeepSeek-OCR",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 7
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/deepseek-ai/DeepSeek-OCR-2",
    "json": "/deepseek-ai/DeepSeek-OCR-2.json"
  },
  {
    "hf_id": "deepseek-ai/DeepSeek-OCR",
    "hf_org": "deepseek-ai",
    "hf_repo": "DeepSeek-OCR",
    "title": "DeepSeek-OCR",
    "provider": "DeepSeek",
    "description": "Frontier OCR model exploring optical context compression for LLMs, optimized for document parsing and markdown generation.",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.12.0",
    "architecture": "moe",
    "parameter_count": "3B",
    "active_parameters": "3B",
    "context_length": 8192,
    "tasks": [
      "multimodal"
    ],
    "performance_headline": "Optical context compression for efficient OCR and document understanding",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 7
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/deepseek-ai/DeepSeek-OCR",
    "json": "/deepseek-ai/DeepSeek-OCR.json"
  },
  {
    "hf_id": "deepseek-ai/DeepSeek-R1",
    "hf_org": "deepseek-ai",
    "hf_repo": "DeepSeek-R1",
    "title": "DeepSeek-R1",
    "provider": "DeepSeek",
    "description": "DeepSeek-R1 is a 671B-parameter MoE reasoning model built on the DeepSeek-V3 architecture, trained with large-scale reinforcement learning for strong chain-of-thought capabilities.",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.12.0",
    "architecture": "moe",
    "parameter_count": "671B",
    "active_parameters": "37B",
    "context_length": 163840,
    "tasks": [
      "text"
    ],
    "performance_headline": "Open-weights RL-trained reasoning model with native FP8 / FP4 variants",
    "variants": {
      "default": {
        "precision": "fp8",
        "vram_minimum_gb": 805
      },
      "r1_0528": {
        "precision": "fp8",
        "vram_minimum_gb": 805
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 403
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep",
      "pd_cluster"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/deepseek-ai/DeepSeek-R1",
    "json": "/deepseek-ai/DeepSeek-R1.json"
  },
  {
    "hf_id": "deepseek-ai/DeepSeek-V3.1",
    "hf_org": "deepseek-ai",
    "hf_repo": "DeepSeek-V3.1",
    "title": "DeepSeek-V3.1",
    "provider": "DeepSeek",
    "description": "DeepSeek-V3.1 is a hybrid MoE model that supports dynamic switching between thinking and non-thinking modes, with tool calling and function execution.",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.12.0",
    "architecture": "moe",
    "parameter_count": "671B",
    "active_parameters": "37B",
    "context_length": 163840,
    "tasks": [
      "text"
    ],
    "performance_headline": "Hybrid thinking / non-thinking MoE with native FP8 and tool calling",
    "variants": {
      "default": {
        "precision": "fp8",
        "vram_minimum_gb": 805
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 403
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep",
      "pd_cluster"
    ],
    "features": [
      "tool_calling",
      "reasoning"
    ],
    "opt_in_features": [],
    "url": "/deepseek-ai/DeepSeek-V3.1",
    "json": "/deepseek-ai/DeepSeek-V3.1.json"
  },
  {
    "hf_id": "deepseek-ai/DeepSeek-V3.2-Exp",
    "hf_org": "deepseek-ai",
    "hf_repo": "DeepSeek-V3.2-Exp",
    "title": "DeepSeek-V3.2-Exp",
    "provider": "DeepSeek",
    "description": "Experimental DeepSeek-V3.2 preview with sparse attention (MQA-like logits) and FP8 KV cache; architecture matches DeepSeek-V3.1 except for the sparse attention mechanism.",
    "date_updated": "2026-04-17",
    "difficulty": "advanced",
    "min_vllm_version": "0.12.0",
    "architecture": "moe",
    "parameter_count": "671B",
    "active_parameters": "37B",
    "context_length": 163840,
    "tasks": [
      "text"
    ],
    "performance_headline": "Sparse attention MoE with FP8 KV cache and strong GSM8K score (~0.96)",
    "variants": {
      "default": {
        "precision": "fp8",
        "vram_minimum_gb": 805
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep"
    ],
    "features": [
      "reasoning"
    ],
    "opt_in_features": [],
    "url": "/deepseek-ai/DeepSeek-V3.2-Exp",
    "json": "/deepseek-ai/DeepSeek-V3.2-Exp.json"
  },
  {
    "hf_id": "deepseek-ai/DeepSeek-V3.2",
    "hf_org": "deepseek-ai",
    "hf_repo": "DeepSeek-V3.2",
    "title": "DeepSeek-V3.2",
    "provider": "DeepSeek",
    "description": "DeepSeek V3.2 MoE model with MLA attention, sparse attention, and scalable RL for strong reasoning and agent capabilities.",
    "date_updated": "2026-04-01",
    "difficulty": "intermediate",
    "min_vllm_version": "0.18.0",
    "architecture": "moe",
    "parameter_count": "671B",
    "active_parameters": "37B",
    "context_length": 163840,
    "tasks": [
      "text"
    ],
    "performance_headline": "GPT-5-level reasoning with efficient MoE inference",
    "variants": {
      "default": {
        "precision": "fp8",
        "vram_minimum_gb": 805
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 403
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep",
      "pd_cluster"
    ],
    "features": [
      "tool_calling",
      "reasoning",
      "spec_decoding"
    ],
    "opt_in_features": [
      "spec_decoding"
    ],
    "url": "/deepseek-ai/DeepSeek-V3.2",
    "json": "/deepseek-ai/DeepSeek-V3.2.json"
  },
  {
    "hf_id": "deepseek-ai/DeepSeek-V3",
    "hf_org": "deepseek-ai",
    "hf_repo": "DeepSeek-V3",
    "title": "DeepSeek-V3",
    "provider": "DeepSeek",
    "description": "DeepSeek-V3 is a 671B-parameter Mixture-of-Experts model with native FP8 weights and strong reasoning, coding, and math capabilities.",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.12.0",
    "architecture": "moe",
    "parameter_count": "671B",
    "active_parameters": "37B",
    "context_length": 163840,
    "tasks": [
      "text"
    ],
    "performance_headline": "Frontier open-weights MoE with native FP8 and FP4 variants",
    "variants": {
      "default": {
        "precision": "fp8",
        "vram_minimum_gb": 805
      },
      "fp4": {
        "precision": "fp4",
        "vram_minimum_gb": 403
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep",
      "pd_cluster"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/deepseek-ai/DeepSeek-V3",
    "json": "/deepseek-ai/DeepSeek-V3.json"
  },
  {
    "hf_id": "inclusionAI/Ring-1T-FP8",
    "hf_org": "inclusionAI",
    "hf_repo": "Ring-1T-FP8",
    "title": "Ring-1T-FP8",
    "provider": "inclusionAI",
    "description": "Ring-1T (BailingMoeV2) FP8 model (~1T total params) for 8xH200 or 8xMI300X deployment",
    "date_updated": "2026-04-17",
    "difficulty": "advanced",
    "min_vllm_version": "0.11.0",
    "architecture": "moe",
    "parameter_count": "1T",
    "active_parameters": "50B",
    "context_length": 65536,
    "tasks": [
      "text"
    ],
    "variants": {
      "default": {
        "precision": "fp8",
        "vram_minimum_gb": 1200
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/inclusionAI/Ring-1T-FP8",
    "json": "/inclusionAI/Ring-1T-FP8.json"
  },
  {
    "hf_id": "internlm/Intern-S1",
    "hf_org": "internlm",
    "hf_repo": "Intern-S1",
    "title": "Intern-S1",
    "provider": "InternLM",
    "description": "Intern-S1 vision-language model from Shanghai AI Lab with BF16/FP8 variants and thinking/non-thinking modes",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.10.0",
    "architecture": "moe",
    "parameter_count": "241B",
    "active_parameters": "28B",
    "context_length": 65536,
    "tasks": [
      "multimodal"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 578
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 289
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [
      "tool_calling",
      "reasoning"
    ],
    "opt_in_features": [],
    "url": "/internlm/Intern-S1",
    "json": "/internlm/Intern-S1.json"
  },
  {
    "hf_id": "jinaai/jina-reranker-m0",
    "hf_org": "jinaai",
    "hf_repo": "jina-reranker-m0",
    "title": "Jina Reranker m0",
    "provider": "Jina AI",
    "description": "Multilingual, multimodal reranker for text and visual documents across 29+ languages via Qwen2-VL backbone",
    "date_updated": "2026-04-17",
    "difficulty": "beginner",
    "min_vllm_version": "0.8.0",
    "architecture": "dense",
    "parameter_count": "2.4B",
    "active_parameters": "2.4B",
    "context_length": 32768,
    "tasks": [
      "embedding"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 6
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/jinaai/jina-reranker-m0",
    "json": "/jinaai/jina-reranker-m0.json"
  },
  {
    "hf_id": "meituan-longcat/LongCat-Image-Edit",
    "hf_org": "meituan-longcat",
    "hf_repo": "LongCat-Image-Edit",
    "title": "LongCat-Image-Edit",
    "provider": "Meituan",
    "description": "Bilingual (Chinese-English) image editing model from Meituan LongCat, served via vLLM-Omni",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.12.0",
    "architecture": "dense",
    "parameter_count": "6B",
    "active_parameters": "6B",
    "context_length": 0,
    "tasks": [
      "omni"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 36
      }
    },
    "compatible_strategies": [],
    "features": [],
    "opt_in_features": [],
    "url": "/meituan-longcat/LongCat-Image-Edit",
    "json": "/meituan-longcat/LongCat-Image-Edit.json"
  },
  {
    "hf_id": "meta-llama/Llama-3.1-8B-Instruct",
    "hf_org": "meta-llama",
    "hf_repo": "Llama-3.1-8B-Instruct",
    "title": "Llama-3.1-8B-Instruct",
    "provider": "Meta",
    "description": "Meta's Llama 3.1 8B dense instruction-tuned language model with 128K context",
    "date_updated": "2026-04-17",
    "difficulty": "beginner",
    "min_vllm_version": "0.6.0",
    "architecture": "dense",
    "parameter_count": "8B",
    "active_parameters": "8B",
    "context_length": 131072,
    "tasks": [
      "text"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 20
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 5
      },
      "nvidia_fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 10
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/meta-llama/Llama-3.1-8B-Instruct",
    "json": "/meta-llama/Llama-3.1-8B-Instruct.json"
  },
  {
    "hf_id": "meta-llama/Llama-3.3-70B-Instruct",
    "hf_org": "meta-llama",
    "hf_repo": "Llama-3.3-70B-Instruct",
    "title": "Llama-3.3-70B",
    "provider": "Meta",
    "description": "Llama 3.3 70B dense model with NVIDIA FP8/FP4 quantized variants for Hopper and Blackwell GPUs",
    "date_updated": "2026-04-16",
    "difficulty": "beginner",
    "min_vllm_version": "0.12.0",
    "architecture": "dense",
    "parameter_count": "70B",
    "active_parameters": "70B",
    "context_length": 131072,
    "tasks": [
      "text"
    ],
    "performance_headline": "",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 170
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 84
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 42
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/meta-llama/Llama-3.3-70B-Instruct",
    "json": "/meta-llama/Llama-3.3-70B-Instruct.json"
  },
  {
    "hf_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "hf_org": "meta-llama",
    "hf_repo": "Llama-4-Scout-17B-16E-Instruct",
    "title": "Llama-4-Scout",
    "provider": "Meta",
    "description": "Llama 4 Scout 17B-16E MoE model with NVIDIA FP8/FP4 variants, fits on a single GPU with quantization",
    "date_updated": "2026-04-16",
    "difficulty": "beginner",
    "min_vllm_version": "0.12.0",
    "architecture": "moe",
    "parameter_count": "109B",
    "active_parameters": "17B",
    "context_length": 10485760,
    "tasks": [
      "text"
    ],
    "performance_headline": "",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 262
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 131
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 65
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "json": "/meta-llama/Llama-4-Scout-17B-16E-Instruct.json"
  },
  {
    "hf_id": "microsoft/Phi-4-mini-instruct",
    "hf_org": "microsoft",
    "hf_repo": "Phi-4-mini-instruct",
    "title": "Phi-4",
    "provider": "Microsoft",
    "description": "Microsoft's Phi-4 family of lightweight dense models (mini-instruct, reasoning, multimodal) with 128K context",
    "date_updated": "2026-04-17",
    "difficulty": "beginner",
    "min_vllm_version": "0.7.0",
    "architecture": "dense",
    "parameter_count": "4B",
    "active_parameters": "4B",
    "context_length": 131072,
    "tasks": [
      "text",
      "multimodal"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 10
      },
      "mini_reasoning": {
        "precision": "bf16",
        "vram_minimum_gb": 10
      },
      "reasoning": {
        "precision": "bf16",
        "vram_minimum_gb": 30
      },
      "multimodal": {
        "precision": "bf16",
        "vram_minimum_gb": 16
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/microsoft/Phi-4-mini-instruct",
    "json": "/microsoft/Phi-4-mini-instruct.json"
  },
  {
    "hf_id": "mistralai/Ministral-3-14B-Instruct-2512",
    "hf_org": "mistralai",
    "hf_repo": "Ministral-3-14B-Instruct-2512",
    "title": "Ministral-3-Instruct",
    "provider": "Mistral AI",
    "description": "Ministral 3 Instruct family (3B/8B/14B) with FP8 weights, vision support, and 256K context",
    "date_updated": "2026-04-17",
    "difficulty": "beginner",
    "min_vllm_version": "0.11.0",
    "architecture": "dense",
    "parameter_count": "14B",
    "active_parameters": "14B",
    "context_length": 262144,
    "tasks": [
      "multimodal"
    ],
    "variants": {
      "default": {
        "precision": "fp8",
        "vram_minimum_gb": 17
      },
      "8b": {
        "precision": "fp8",
        "vram_minimum_gb": 12
      },
      "3b": {
        "precision": "fp8",
        "vram_minimum_gb": 6
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 17
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [
      "tool_calling"
    ],
    "opt_in_features": [],
    "url": "/mistralai/Ministral-3-14B-Instruct-2512",
    "json": "/mistralai/Ministral-3-14B-Instruct-2512.json"
  },
  {
    "hf_id": "mistralai/Ministral-3-8B-Reasoning-2512",
    "hf_org": "mistralai",
    "hf_repo": "Ministral-3-8B-Reasoning-2512",
    "title": "Ministral-3-Reasoning",
    "provider": "Mistral AI",
    "description": "Ministral 3 Reasoning family (3B/8B/14B) with BF16 weights, vision support, and 256K context",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.0",
    "architecture": "dense",
    "parameter_count": "8B",
    "active_parameters": "8B",
    "context_length": 262144,
    "tasks": [
      "multimodal"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 22
      },
      "3b": {
        "precision": "bf16",
        "vram_minimum_gb": 8
      },
      "14b": {
        "precision": "bf16",
        "vram_minimum_gb": 32
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [
      "tool_calling",
      "reasoning"
    ],
    "opt_in_features": [],
    "url": "/mistralai/Ministral-3-8B-Reasoning-2512",
    "json": "/mistralai/Ministral-3-8B-Reasoning-2512.json"
  },
  {
    "hf_id": "mistralai/Mistral-Large-3-675B-Instruct-2512",
    "hf_org": "mistralai",
    "hf_repo": "Mistral-Large-3-675B-Instruct-2512",
    "title": "Mistral-Large-3-675B-Instruct",
    "provider": "Mistral AI",
    "description": "Mistral Large 3 (675B) with FP8 and NVFP4 weights for 8xH200 / 4xB200 deployments",
    "date_updated": "2026-04-17",
    "difficulty": "advanced",
    "min_vllm_version": "0.11.0",
    "architecture": "moe",
    "parameter_count": "675B",
    "active_parameters": "22B",
    "context_length": 294912,
    "tasks": [
      "multimodal"
    ],
    "variants": {
      "default": {
        "precision": "fp8",
        "vram_minimum_gb": 810
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 405
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 810
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [
      "tool_calling"
    ],
    "opt_in_features": [],
    "url": "/mistralai/Mistral-Large-3-675B-Instruct-2512",
    "json": "/mistralai/Mistral-Large-3-675B-Instruct-2512.json"
  },
  {
    "hf_id": "moonshotai/Kimi-K2-Instruct",
    "hf_org": "moonshotai",
    "hf_repo": "Kimi-K2-Instruct",
    "title": "Kimi-K2-Instruct",
    "provider": "Moonshot AI",
    "description": "Moonshot AI's Kimi-K2 is a trillion-parameter MoE instruction model (~32B active) with native FP8 weights and strong tool-calling capabilities.",
    "date_updated": "2026-04-17",
    "difficulty": "advanced",
    "min_vllm_version": "0.12.0",
    "architecture": "moe",
    "parameter_count": "1T",
    "active_parameters": "32B",
    "context_length": 131072,
    "tasks": [
      "text"
    ],
    "performance_headline": "Open-weights 1T-parameter MoE with native FP8 and Kimi K2 tool calling",
    "variants": {
      "default": {
        "precision": "fp8",
        "vram_minimum_gb": 1200
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep",
      "pd_cluster"
    ],
    "features": [
      "tool_calling"
    ],
    "opt_in_features": [],
    "url": "/moonshotai/Kimi-K2-Instruct",
    "json": "/moonshotai/Kimi-K2-Instruct.json"
  },
  {
    "hf_id": "moonshotai/Kimi-K2-Thinking",
    "hf_org": "moonshotai",
    "hf_repo": "Kimi-K2-Thinking",
    "title": "Kimi-K2-Thinking",
    "provider": "Moonshot AI",
    "description": "Kimi-K2-Thinking is an advanced reasoning MoE model with native INT4 QAT weights, designed for long-horizon agent workflows interleaving chain-of-thought reasoning with tool calls.",
    "date_updated": "2026-04-17",
    "difficulty": "advanced",
    "min_vllm_version": "0.12.0",
    "architecture": "moe",
    "parameter_count": "1T",
    "active_parameters": "32B",
    "context_length": 262144,
    "tasks": [
      "text"
    ],
    "performance_headline": "1T MoE thinking model with native INT4 QAT for 2x low-latency speed-up",
    "variants": {
      "default": {
        "precision": "int4",
        "vram_minimum_gb": 600
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 600
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep",
      "pd_cluster"
    ],
    "features": [
      "tool_calling",
      "reasoning"
    ],
    "opt_in_features": [],
    "url": "/moonshotai/Kimi-K2-Thinking",
    "json": "/moonshotai/Kimi-K2-Thinking.json"
  },
  {
    "hf_id": "moonshotai/Kimi-K2.5",
    "hf_org": "moonshotai",
    "hf_repo": "Kimi-K2.5",
    "title": "Kimi-K2.5",
    "provider": "Moonshot AI",
    "description": "Open-source native multimodal agentic MoE model with vision-language understanding, tool calling, and thinking modes",
    "date_updated": "2026-04-16",
    "difficulty": "intermediate",
    "min_vllm_version": "0.15.0",
    "architecture": "moe",
    "parameter_count": "1T",
    "active_parameters": "32B",
    "context_length": 262144,
    "tasks": [
      "multimodal"
    ],
    "performance_headline": "Multimodal agentic MoE model with DeepSeek-V3 backbone and MLA attention",
    "variants": {
      "default": {
        "precision": "int4",
        "vram_minimum_gb": 714
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 600
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep",
      "pd_cluster"
    ],
    "features": [
      "tool_calling",
      "reasoning",
      "spec_decoding"
    ],
    "opt_in_features": [
      "spec_decoding"
    ],
    "url": "/moonshotai/Kimi-K2.5",
    "json": "/moonshotai/Kimi-K2.5.json"
  },
  {
    "hf_id": "moonshotai/Kimi-Linear-48B-A3B-Instruct",
    "hf_org": "moonshotai",
    "hf_repo": "Kimi-Linear-48B-A3B-Instruct",
    "title": "Kimi-Linear-48B-A3B-Instruct",
    "provider": "Moonshot AI",
    "description": "Kimi-Linear is a 48B-parameter instruction-tuned MoE model (~3B activated) with a linear-attention variant supporting very long context (1M tokens).",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.2",
    "architecture": "moe",
    "parameter_count": "48B",
    "active_parameters": "3B",
    "context_length": 1048576,
    "tasks": [
      "text"
    ],
    "performance_headline": "Linear-attention MoE with 1M-token context on a single node",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 115
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/moonshotai/Kimi-Linear-48B-A3B-Instruct",
    "json": "/moonshotai/Kimi-Linear-48B-A3B-Instruct.json"
  },
  {
    "hf_id": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
    "hf_org": "nvidia",
    "hf_repo": "NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
    "title": "NVIDIA Nemotron-3-Nano-30B-A3B",
    "provider": "NVIDIA",
    "description": "NVIDIA Nemotron-3-Nano Mamba-hybrid MoE (30B total / ~3B active) with BF16 and FP8 variants",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.2",
    "architecture": "moe",
    "parameter_count": "30B",
    "active_parameters": "3B",
    "context_length": 262144,
    "tasks": [
      "text"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 72
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 35
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [
      "tool_calling",
      "reasoning"
    ],
    "opt_in_features": [],
    "url": "/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
    "json": "/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.json"
  },
  {
    "hf_id": "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
    "hf_org": "nvidia",
    "hf_repo": "NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
    "title": "NVIDIA Nemotron-Nano-12B-v2-VL",
    "provider": "NVIDIA",
    "description": "NVIDIA Nemotron-Nano 12B vision-language model with video support and Efficient Video Sampling (EVS)",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.1",
    "architecture": "dense",
    "parameter_count": "12B",
    "active_parameters": "12B",
    "context_length": 131072,
    "tasks": [
      "multimodal"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 29
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 14
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 8
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [
      "video_compression"
    ],
    "opt_in_features": [],
    "url": "/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
    "json": "/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16.json"
  },
  {
    "hf_id": "openai/gpt-oss-120b",
    "hf_org": "openai",
    "hf_repo": "gpt-oss-120b",
    "title": "GPT-OSS",
    "provider": "OpenAI",
    "description": "OpenAI's gpt-oss family (20B / 120B) with MXFP4 MoE, attention-sinks, built-in tools via Responses API",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.10.0",
    "architecture": "moe",
    "parameter_count": "120B",
    "active_parameters": "5.1B",
    "context_length": 131072,
    "tasks": [
      "text"
    ],
    "variants": {
      "default": {
        "precision": "mxfp4",
        "vram_minimum_gb": 96
      },
      "20b": {
        "precision": "mxfp4",
        "vram_minimum_gb": 40
      },
      "amd_fp8": {
        "precision": "mxfp4",
        "vram_minimum_gb": 80
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [
      "tool_calling",
      "spec_decoding"
    ],
    "opt_in_features": [
      "spec_decoding"
    ],
    "url": "/openai/gpt-oss-120b",
    "json": "/openai/gpt-oss-120b.json"
  },
  {
    "hf_id": "stabilityai/stable-audio-open-1.0",
    "hf_org": "stabilityai",
    "hf_repo": "stable-audio-open-1.0",
    "title": "Stable Audio Open",
    "provider": "Stability AI",
    "description": "Text-to-audio generation model (1.2B params) producing up to ~47 s stereo audio at 44.1 kHz, served via vLLM-Omni",
    "date_updated": "2026-04-17",
    "difficulty": "beginner",
    "min_vllm_version": "0.14.1",
    "architecture": "dense",
    "parameter_count": "1.2B",
    "active_parameters": "1.2B",
    "context_length": 0,
    "tasks": [
      "omni"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 19
      }
    },
    "compatible_strategies": [],
    "features": [],
    "opt_in_features": [],
    "url": "/stabilityai/stable-audio-open-1.0",
    "json": "/stabilityai/stable-audio-open-1.0.json"
  },
  {
    "hf_id": "stabilityai/stable-diffusion-3.5-medium",
    "hf_org": "stabilityai",
    "hf_repo": "stable-diffusion-3.5-medium",
    "title": "Stable Diffusion 3.5",
    "provider": "Stability AI",
    "description": "Stability AI's Stable Diffusion 3.5 text-to-image family (medium 2.5B, large 8.1B, large-turbo) via vLLM-Omni with Cache-DiT acceleration",
    "date_updated": "2026-04-17",
    "difficulty": "beginner",
    "min_vllm_version": "0.12.0",
    "architecture": "dense",
    "parameter_count": "2.5B",
    "active_parameters": "2.5B",
    "context_length": 0,
    "tasks": [
      "omni"
    ],
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 44
      },
      "large": {
        "precision": "bf16",
        "vram_minimum_gb": 24
      },
      "large_turbo": {
        "precision": "bf16",
        "vram_minimum_gb": 24
      }
    },
    "compatible_strategies": [],
    "features": [],
    "opt_in_features": [],
    "url": "/stabilityai/stable-diffusion-3.5-medium",
    "json": "/stabilityai/stable-diffusion-3.5-medium.json"
  },
  {
    "hf_id": "stepfun-ai/Step-3.5-Flash",
    "hf_org": "stepfun-ai",
    "hf_repo": "Step-3.5-Flash",
    "title": "Step-3.5-Flash",
    "provider": "StepFun",
    "description": "Production-grade reasoning MoE (~196B total / 11B active parameters) with hybrid attention schedules, SWA compensation, and multi-token prediction for low-latency long-context inference",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.0",
    "architecture": "moe",
    "parameter_count": "196B",
    "active_parameters": "11B",
    "context_length": 262144,
    "tasks": [
      "text"
    ],
    "performance_headline": "Sparse MoE reasoning model with hybrid attention and step3p5 MTP speculative decoding",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 470
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 235
      },
      "int4": {
        "precision": "int4",
        "vram_minimum_gb": 118
      },
      "int8": {
        "precision": "int8",
        "vram_minimum_gb": 235
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [
      "tool_calling",
      "reasoning",
      "spec_decoding"
    ],
    "opt_in_features": [
      "spec_decoding"
    ],
    "url": "/stepfun-ai/Step-3.5-Flash",
    "json": "/stepfun-ai/Step-3.5-Flash.json"
  },
  {
    "hf_id": "tencent/Hunyuan-A13B-Instruct",
    "hf_org": "tencent",
    "hf_repo": "Hunyuan-A13B-Instruct",
    "title": "Hunyuan-A13B-Instruct",
    "provider": "Tencent Hunyuan",
    "description": "Tencent Hunyuan A13B instruct-tuned MoE language model with AITER-accelerated AMD ROCm deployment",
    "date_updated": "2026-04-17",
    "difficulty": "beginner",
    "min_vllm_version": "0.11.0",
    "architecture": "moe",
    "parameter_count": "80B",
    "active_parameters": "13B",
    "context_length": 32768,
    "tasks": [
      "text"
    ],
    "performance_headline": "Hunyuan-A13B MoE with AITER acceleration on AMD MI300X/MI325X/MI355X",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 580
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 96
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/tencent/Hunyuan-A13B-Instruct",
    "json": "/tencent/Hunyuan-A13B-Instruct.json"
  },
  {
    "hf_id": "tencent/HunyuanOCR",
    "hf_org": "tencent",
    "hf_repo": "HunyuanOCR",
    "title": "HunyuanOCR",
    "provider": "Tencent Hunyuan",
    "description": "Tencent Hunyuan end-to-end OCR expert VLM (~1B) for online OCR serving with an OpenAI-compatible API",
    "date_updated": "2026-04-17",
    "difficulty": "beginner",
    "min_vllm_version": "0.11.0",
    "architecture": "dense",
    "parameter_count": "1B",
    "active_parameters": "1B",
    "context_length": 32768,
    "tasks": [
      "multimodal"
    ],
    "performance_headline": "Compact 1B end-to-end OCR VLM from the Hunyuan native multimodal family",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 2
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/tencent/HunyuanOCR",
    "json": "/tencent/HunyuanOCR.json"
  },
  {
    "hf_id": "zai-org/GLM-4.5",
    "hf_org": "zai-org",
    "hf_repo": "GLM-4.5",
    "title": "GLM-4.5",
    "provider": "GLM (Z-AI)",
    "description": "GLM-4.5 MoE language model (~358B total parameters, BF16) with built-in MTP layers for speculative decoding and native tool calling",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.0",
    "architecture": "moe",
    "parameter_count": "358B",
    "active_parameters": "32B",
    "context_length": 131072,
    "tasks": [
      "text"
    ],
    "performance_headline": "GLM-4.X series MoE model with native FP8 and BF16 support and MTP speculative decoding",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 859
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 430
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep"
    ],
    "features": [
      "tool_calling",
      "reasoning",
      "spec_decoding"
    ],
    "opt_in_features": [
      "spec_decoding"
    ],
    "url": "/zai-org/GLM-4.5",
    "json": "/zai-org/GLM-4.5.json"
  },
  {
    "hf_id": "zai-org/GLM-4.5V",
    "hf_org": "zai-org",
    "hf_repo": "GLM-4.5V",
    "title": "GLM-4.5V",
    "provider": "GLM (Z-AI)",
    "description": "GLM-4.5 vision-language MoE model (~107B parameters, BF16) with image-text-to-text capability, 64K context, expert parallelism, and native FP8",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.12.0",
    "architecture": "moe",
    "parameter_count": "107B",
    "active_parameters": "12B",
    "context_length": 65536,
    "tasks": [
      "multimodal"
    ],
    "performance_headline": "Multimodal GLM-4.5V with native FP8 and expert parallelism, deploys on 4xH100",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 257
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 128
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [
      "tool_calling",
      "reasoning"
    ],
    "opt_in_features": [],
    "url": "/zai-org/GLM-4.5V",
    "json": "/zai-org/GLM-4.5V.json"
  },
  {
    "hf_id": "zai-org/GLM-4.6",
    "hf_org": "zai-org",
    "hf_repo": "GLM-4.6",
    "title": "GLM-4.6",
    "provider": "GLM (Z-AI)",
    "description": "GLM-4.6 MoE language model (~357B total parameters, BF16) with MTP speculative decoding, native tool calling and reasoning",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.0",
    "architecture": "moe",
    "parameter_count": "357B",
    "active_parameters": "32B",
    "context_length": 202752,
    "tasks": [
      "text"
    ],
    "performance_headline": "Updated GLM-4.X series MoE model with native FP8 and BF16, MTP speculative decoding",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 857
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 428
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep"
    ],
    "features": [
      "tool_calling",
      "reasoning",
      "spec_decoding"
    ],
    "opt_in_features": [
      "spec_decoding"
    ],
    "url": "/zai-org/GLM-4.6",
    "json": "/zai-org/GLM-4.6.json"
  },
  {
    "hf_id": "zai-org/GLM-4.6V",
    "hf_org": "zai-org",
    "hf_repo": "GLM-4.6V",
    "title": "GLM-4.6V",
    "provider": "GLM (Z-AI)",
    "description": "GLM-4.6 vision-language MoE model — image-text-to-text with 128K context, native FP8 checkpoint, and expert parallelism",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.12.0",
    "architecture": "moe",
    "parameter_count": "107B",
    "active_parameters": "12B",
    "context_length": 131072,
    "tasks": [
      "multimodal"
    ],
    "performance_headline": "Updated GLM-V series with 128K context length and native FP8",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 257
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 128
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [
      "tool_calling",
      "reasoning"
    ],
    "opt_in_features": [],
    "url": "/zai-org/GLM-4.6V",
    "json": "/zai-org/GLM-4.6V.json"
  },
  {
    "hf_id": "zai-org/GLM-4.7",
    "hf_org": "zai-org",
    "hf_repo": "GLM-4.7",
    "title": "GLM-4.7",
    "provider": "GLM (Z-AI)",
    "description": "GLM-4.7 MoE language model (~358B total parameters) with MTP speculative decoding, updated tool call parser, and reasoning support",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.0",
    "architecture": "moe",
    "parameter_count": "358B",
    "active_parameters": "32B",
    "context_length": 202752,
    "tasks": [
      "text"
    ],
    "performance_headline": "Latest GLM-4.X release with updated glm47 tool call parser and MTP speculative decoding",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 859
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 430
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 215
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "single_node_dep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_dep",
      "multi_node_tep"
    ],
    "features": [
      "tool_calling",
      "reasoning",
      "spec_decoding"
    ],
    "opt_in_features": [
      "spec_decoding"
    ],
    "url": "/zai-org/GLM-4.7",
    "json": "/zai-org/GLM-4.7.json"
  },
  {
    "hf_id": "zai-org/GLM-5.1",
    "hf_org": "zai-org",
    "hf_repo": "GLM-5.1",
    "title": "GLM-5.1",
    "provider": "GLM (Z-AI)",
    "description": "GLM-5.1 refreshed version of GLM-5 — frontier-scale MoE language model (~744B total parameters) with MTP speculative decoding and thinking mode",
    "date_updated": "2026-04-17",
    "difficulty": "advanced",
    "min_vllm_version": "0.19.0",
    "architecture": "moe",
    "parameter_count": "744B",
    "active_parameters": "40B",
    "context_length": 202752,
    "tasks": [
      "text"
    ],
    "performance_headline": "Refreshed GLM-5 series MoE with improved reasoning, coding, and agentic performance",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 1786
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 893
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [
      "tool_calling",
      "reasoning",
      "spec_decoding"
    ],
    "opt_in_features": [
      "spec_decoding"
    ],
    "url": "/zai-org/GLM-5.1",
    "json": "/zai-org/GLM-5.1.json"
  },
  {
    "hf_id": "zai-org/GLM-5",
    "hf_org": "zai-org",
    "hf_repo": "GLM-5",
    "title": "GLM-5",
    "provider": "GLM (Z-AI)",
    "description": "GLM-5 frontier-scale MoE language model (~744B total parameters, 28.5T training tokens) with asynchronous RL infrastructure for reasoning, coding, and agentic tasks",
    "date_updated": "2026-04-17",
    "difficulty": "advanced",
    "min_vllm_version": "0.16.0",
    "architecture": "moe",
    "parameter_count": "744B",
    "active_parameters": "40B",
    "context_length": 202752,
    "tasks": [
      "text"
    ],
    "performance_headline": "Frontier-scale MoE with 744B parameters, best-in-class open-source performance on reasoning/coding/agents",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 1786
      },
      "fp8": {
        "precision": "fp8",
        "vram_minimum_gb": 893
      },
      "nvfp4": {
        "precision": "nvfp4",
        "vram_minimum_gb": 446
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "single_node_tep",
      "multi_node_tp",
      "multi_node_tp_pp",
      "multi_node_tep",
      "multi_node_dep"
    ],
    "features": [
      "tool_calling",
      "reasoning",
      "spec_decoding"
    ],
    "opt_in_features": [
      "spec_decoding"
    ],
    "url": "/zai-org/GLM-5",
    "json": "/zai-org/GLM-5.json"
  },
  {
    "hf_id": "zai-org/GLM-ASR-Nano-2512",
    "hf_org": "zai-org",
    "hf_repo": "GLM-ASR-Nano-2512",
    "title": "GLM-ASR-Nano-2512",
    "provider": "GLM (Z-AI)",
    "description": "Open-source speech recognition model (~2B) with strong dialect support (Cantonese and others) and robust low-volume speech transcription",
    "date_updated": "2026-04-17",
    "difficulty": "beginner",
    "min_vllm_version": "0.14.1",
    "architecture": "dense",
    "parameter_count": "2.3B",
    "active_parameters": "1.5B",
    "context_length": 8192,
    "tasks": [
      "multimodal"
    ],
    "performance_headline": "Outperforms Whisper V3 on multiple benchmarks at compact 1.5B active / 2B total size",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 11
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/zai-org/GLM-ASR-Nano-2512",
    "json": "/zai-org/GLM-ASR-Nano-2512.json"
  },
  {
    "hf_id": "zai-org/GLM-Image",
    "hf_org": "zai-org",
    "hf_repo": "GLM-Image",
    "title": "GLM-Image",
    "provider": "GLM (Z-AI)",
    "description": "Hybrid autoregressive + diffusion image generation model — text-to-image and image-to-image with strong text rendering and knowledge-intensive generation",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.0",
    "architecture": "dense",
    "parameter_count": "16B",
    "active_parameters": "16B",
    "context_length": 4096,
    "tasks": [
      "multimodal"
    ],
    "performance_headline": "9B AR generator + 7B DiT decoder, state-of-the-art text rendering in generated images",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 38
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [],
    "opt_in_features": [],
    "url": "/zai-org/GLM-Image",
    "json": "/zai-org/GLM-Image.json"
  },
  {
    "hf_id": "zai-org/GLM-OCR",
    "hf_org": "zai-org",
    "hf_repo": "GLM-OCR",
    "title": "GLM-OCR",
    "provider": "GLM (Z-AI)",
    "description": "GLM-OCR image-to-text model with built-in MTP speculative decoding for high-throughput OCR serving",
    "date_updated": "2026-04-17",
    "difficulty": "beginner",
    "min_vllm_version": "0.12.0",
    "architecture": "dense",
    "parameter_count": "0.9B",
    "active_parameters": "0.9B",
    "context_length": 131072,
    "tasks": [
      "multimodal"
    ],
    "performance_headline": "Multilingual end-to-end OCR VLM with MTP-accelerated decoding",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 2
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [
      "spec_decoding"
    ],
    "opt_in_features": [],
    "url": "/zai-org/GLM-OCR",
    "json": "/zai-org/GLM-OCR.json"
  },
  {
    "hf_id": "zai-org/Glyph",
    "hf_org": "zai-org",
    "hf_repo": "Glyph",
    "title": "Glyph",
    "provider": "GLM (Z-AI)",
    "description": "Visual-text compression framework that renders long text into images and processes them with a reasoning VLM, scaling effective context length",
    "date_updated": "2026-04-17",
    "difficulty": "intermediate",
    "min_vllm_version": "0.11.0",
    "architecture": "dense",
    "parameter_count": "10B",
    "active_parameters": "10B",
    "context_length": 131072,
    "tasks": [
      "multimodal"
    ],
    "performance_headline": "Reasoning multimodal model for visual-text compression, single-GPU deployable",
    "variants": {
      "default": {
        "precision": "bf16",
        "vram_minimum_gb": 24
      }
    },
    "compatible_strategies": [
      "single_node_tp",
      "multi_node_tp",
      "multi_node_tp_pp"
    ],
    "features": [
      "reasoning"
    ],
    "opt_in_features": [],
    "url": "/zai-org/Glyph",
    "json": "/zai-org/Glyph.json"
  }
]