FailureSensorIQ leaderboard

Intro text

{
  "headers": [
    "T",
    "Model",
    "Average ⬆️",
    "Acc_All",
    "Acc_Sel",
    "Acc_El",
    "Acc_Perturb",
    "Consistency_Score",
    "Type",
    "Architecture",
    "Precision",
    "Hub License",
    "#Params (B)",
    "Hub ❤️",
    "Available on the hub",
    "Model sha"
  ],
  "data": [
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://openai.com/index/openai-o3-mini/\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">openai/o3-mini</a>",
      57.17,
      58.46,
      56.81,
      69.84,
      53.28,
      47.47,
      "instruction-tuned",
      "?",
      "bfloat16",
      "apache-2.0",
      500,
      78,
      false,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://openai.com/o1/\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">openai/o1</a>",
      56.66,
      60.4,
      61.06,
      67.89,
      49.76,
      44.17,
      "instruction-tuned",
      "?",
      "bfloat16",
      "apache-2.0",
      500,
      78,
      false,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Original\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">meta-llama/Llama-4-Maverick-17B-128E-Original</a>",
      52.43,
      55.83,
      44.47,
      71.9,
      49.12,
      40.83,
      "instruction-tuned",
      "?",
      "bfloat16",
      "apache-2.0",
      400,
      78,
      false,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/openai/gpt-4.1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">openai/gpt-4.1</a>",
      50.94,
      53.47,
      56.38,
      59.17,
      45.74,
      39.93,
      "instruction-tuned",
      "?",
      "bfloat16",
      "apache-2.0",
      500,
      78,
      false,
      "main"
    ],
    [
      "🟦",
      "<a target=\"_blank\" href=\"https://huggingface.co/deepseek-ai/DeepSeek-R1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">deepseek-ai/DeepSeek-R1</a>",
      50.58,
      50.09,
      45.74,
      59.75,
      54.37,
      42.93,
      "RL-tuned",
      "DeepseekV3ForCausalLM",
      "bfloat16",
      "apache-2.0",
      685,
      78,
      true,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://openai.com/index/introducing-openai-o1-preview/\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">openai/o1-preview</a>",
      50.51,
      52.31,
      49.57,
      62.5,
      47.51,
      40.68,
      "instruction-tuned",
      "?",
      "bfloat16",
      "apache-2.0",
      500,
      78,
      false,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">meta-llama/Llama-3.1-405B-Instruct</a>",
      46.44,
      51.26,
      48.72,
      61.24,
      40.04,
      30.93,
      "instruction-tuned",
      "LlamaForCausalLM",
      "bfloat16",
      "apache-2.0",
      405,
      78,
      true,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/mistralai/Mistral-Large-Instruct-2407\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">mistralai/Mistral-Large-Instruct-2407</a>",
      45.27,
      50.09,
      51.28,
      57.57,
      38.1,
      29.32,
      "instruction-tuned",
      "MistralForCausalLM",
      "bfloat16",
      "apache-2.0",
      123,
      78,
      true,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/openai/gpt-4.1-mini\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">openai/gpt-4.1-mini</a>",
      44.77,
      49.27,
      45.53,
      57.34,
      39.97,
      31.76,
      "instruction-tuned",
      "?",
      "bfloat16",
      "apache-2.0",
      500,
      78,
      false,
      "main"
    ],
    [
      "🟦",
      "<a target=\"_blank\" href=\"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">deepseek-ai/DeepSeek-R1-Distill-Llama-70B</a>",
      44.29,
      44.62,
      36.38,
      65.14,
      44.99,
      30.3,
      "RL-tuned",
      "LlamaForCausalLM",
      "bfloat16",
      "apache-2.0",
      70.6,
      78,
      true,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">meta-llama/Llama-4-Scout-17B-16E</a>",
      43.08,
      53.96,
      44.47,
      63.53,
      29.36,
      24.11,
      "instruction-tuned",
      "?",
      "bfloat16",
      "apache-2.0",
      109,
      78,
      false,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/microsoft/phi-4\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">microsoft/phi-4</a>",
      43.04,
      48.56,
      40.43,
      60.32,
      36.3,
      29.62,
      "instruction-tuned",
      "Phi3ForCausalLM",
      "bfloat16",
      "apache-2.0",
      14.7,
      78,
      true,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/google/gemma-2-9b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">google/gemma-2-9b</a>",
      42.87,
      43.98,
      30.43,
      58.6,
      45.29,
      36.07,
      "instruction-tuned",
      "?",
      "bfloat16",
      "apache-2.0",
      9.24,
      78,
      false,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/meta-llama/Llama-3.2-11B-Vision\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">meta-llama/Llama-3.2-11B-Vision</a>",
      40.1,
      39.11,
      33.83,
      50.92,
      45.74,
      30.9,
      "instruction-tuned",
      "?",
      "bfloat16",
      "apache-2.0",
      70.6,
      78,
      false,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">meta-llama/Llama-3.3-70B-Instruct</a>",
      39.1,
      41.69,
      35.11,
      55.85,
      37.57,
      25.27,
      "instruction-tuned",
      "LlamaForCausalLM",
      "bfloat16",
      "apache-2.0",
      70.6,
      78,
      true,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/mistralai/Mixtral-8x22B-v0.1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">mistralai/Mixtral-8x22B-v0.1</a>",
      38.78,
      45.18,
      42.55,
      59.06,
      27.52,
      19.57,
      "instruction-tuned",
      "MixtralForCausalLM",
      "bfloat16",
      "apache-2.0",
      46.7,
      78,
      true,
      "main"
    ],
    [
      "🟦",
      "<a target=\"_blank\" href=\"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">deepseek-ai/DeepSeek-R1-Distill-Llama-8B</a>",
      35.37,
      43.04,
      38.94,
      54.36,
      24.26,
      16.27,
      "RL-tuned",
      "LlamaForCausalLM",
      "bfloat16",
      "apache-2.0",
      8.03,
      78,
      true,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/Qwen/Qwen2.5-7B-Instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Qwen/Qwen2.5-7B-Instruct</a>",
      34.71,
      38.73,
      40.64,
      49.54,
      28.2,
      16.42,
      "instruction-tuned",
      "Qwen2ForCausalLM",
      "bfloat16",
      "apache-2.0",
      7.62,
      78,
      true,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">meta-llama/Llama-3.1-8B-Instruct</a>",
      33.33,
      40.04,
      36.17,
      51.15,
      25.35,
      13.95,
      "instruction-tuned",
      "LlamaForCausalLM",
      "bfloat16",
      "apache-2.0",
      8.03,
      78,
      true,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/openai/gpt-4.1-nano\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">openai/gpt-4.1-nano</a>",
      31.18,
      41.77,
      40,
      50.34,
      14.47,
      9.3,
      "instruction-tuned",
      "?",
      "bfloat16",
      "apache-2.0",
      500,
      78,
      false,
      "main"
    ],
    [
      "🟦",
      "<a target=\"_blank\" href=\"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">deepseek-ai/DeepSeek-R1-Distill-Qwen-7B</a>",
      26.6,
      34.01,
      23.83,
      50.11,
      17.02,
      8.02,
      "RL-tuned",
      "Qwen2ForCausalLM",
      "bfloat16",
      "apache-2.0",
      7.62,
      78,
      true,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/ibm-granite/granite-3.2-8b-instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">ibm-granite/granite-3.2-8b-instruct</a>",
      25.95,
      30.26,
      41.7,
      29.82,
      19.24,
      8.74,
      "instruction-tuned",
      "GraniteForCausalLM",
      "bfloat16",
      "apache-2.0",
      8.17,
      78,
      true,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/ibm-granite/granite-3.3-8b-instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">ibm-granite/granite-3.3-8b-instruct</a>",
      25.11,
      25.83,
      32.13,
      29.47,
      23.73,
      14.4,
      "instruction-tuned",
      "GraniteForCausalLM",
      "bfloat16",
      "apache-2.0",
      8.17,
      78,
      true,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/mistralai/Mixtral-8x7B-v0.1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">mistralai/Mixtral-8x7B-v0.1</a>",
      21.36,
      27.6,
      25.32,
      38.19,
      11.21,
      4.46,
      "instruction-tuned",
      "MixtralForCausalLM",
      "bfloat16",
      "apache-2.0",
      46.7,
      78,
      true,
      "main"
    ],
    [
      "⭕",
      "<a target=\"_blank\" href=\"https://huggingface.co/ibm-granite/granite-3.0-8b-instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">ibm-granite/granite-3.0-8b-instruct</a>",
      19.34,
      22.8,
      16.17,
      36.7,
      16.16,
      4.87,
      "instruction-tuned",
      "GraniteForCausalLM",
      "bfloat16",
      "apache-2.0",
      8.17,
      78,
      true,
      "main"
    ]
  ],
  "metadata": null
}