{
  "version": 1,
  "generated_at": "2026-05-01T20:40:58.895849Z",
  "run_count": 120,
  "repeat_groups": [
    {
      "group_id": "8f6fb5570231a90f",
      "task_fingerprint": "0eb7fe892594e382024c3813de8ec46058d12d33d835017ca9df11d4d238d61e",
      "normalized_tag_key": "*-ing*;adverbial;clause;english;syntax",
      "task_name_display": "ADV *-ing* clause",
      "task_names_seen": [
        "ADV *-ing* clause"
      ],
      "tags_display": "*-ing*; syntax; adverbial; clause; English",
      "provider": "openai",
      "model": "gpt5",
      "run_count": 2,
      "alpha_nominal": 0.9785732474808105,
      "pairable_item_count": 295,
      "rated_item_count": 307,
      "fully_shared_item_count": 295,
      "category_count": 3,
      "run_stems": [
        "adv-ing____gpt5__2025-11-09-00-09",
        "adv-ing____gpt5__2025-11-08-17-24"
      ],
      "metrics_files": [
        "adv-ing____gpt5__2025-11-09-00-09__metrics.json",
        "adv-ing____gpt5__2025-11-08-17-24__metrics.json"
      ],
      "timestamps": [
        "2025-11-08T23:09:42.353960Z",
        "2025-11-08T16:24:26.999467Z"
      ],
      "accuracies": [
        0.9025157232704403,
        0.9245283018867925
      ]
    },
    {
      "group_id": "dbae833088ea91ef",
      "task_fingerprint": "0eb7fe892594e382024c3813de8ec46058d12d33d835017ca9df11d4d238d61e",
      "normalized_tag_key": "*-ing*;adverbial;clause;english;syntax",
      "task_name_display": "ADV *-ing* clause",
      "task_names_seen": [
        "ADV *-ing* clause"
      ],
      "tags_display": "*-ing*; syntax; adverbial; clause; English",
      "provider": "google",
      "model": "gemini-3-flash-preview",
      "run_count": 2,
      "alpha_nominal": 0.9731878015052833,
      "pairable_item_count": 318,
      "rated_item_count": 318,
      "fully_shared_item_count": 318,
      "category_count": 3,
      "run_stems": [
        "adv-ing__google__modelsgemini3flashpreview__2026-03-11-15-40",
        "adv-ing__vertex__gemini3flashpreview__2026-03-11-01-07"
      ],
      "metrics_files": [
        "adv-ing__google__modelsgemini3flashpreview__2026-03-11-15-40__metrics.json",
        "adv-ing__vertex__gemini3flashpreview__2026-03-11-01-07__metrics.json"
      ],
      "timestamps": [
        "2026-03-11T14:40:24.117684Z",
        "2026-03-11T10:11:38.314790Z"
      ],
      "accuracies": [
        0.9811320754716981,
        0.9748427672955975
      ]
    },
    {
      "group_id": "7117b4cf3248e0be",
      "task_fingerprint": "5d2775386b7429a15d795e64be76f5d0829aff4b3b67fc8acae71e4adebccffd",
      "normalized_tag_key": "morphology;number;old english",
      "task_name_display": "OE number",
      "task_names_seen": [
        "OE number"
      ],
      "tags_display": "Old English; morphology; number",
      "provider": "vertex",
      "model": "gemini-3-flash-preview",
      "run_count": 6,
      "alpha_nominal": 0.9675264404461101,
      "pairable_item_count": 1200,
      "rated_item_count": 1200,
      "fully_shared_item_count": 1182,
      "category_count": 2,
      "run_stems": [
        "OE_number__vertex__gemini3flashpreview__2026-03-11-00-23",
        "OE_number__vertex__gemini3flashpreview__2026-03-10-22-35",
        "OE_number__vertex__gemini3flashpreview__2026-03-10-22-02",
        "OE_number__vertex__gemini3flashpreview__2026-03-10-21-17",
        "OE_number__vertex__gemini3flashpreview__2026-02-27-02-12",
        "OE_number__vertex__gemini3flashpreview__2026-02-23-16-58"
      ],
      "metrics_files": [
        "OE_number__vertex__gemini3flashpreview__2026-03-11-00-23__metrics.json",
        "OE_number__vertex__gemini3flashpreview__2026-03-10-22-35__metrics.json",
        "OE_number__vertex__gemini3flashpreview__2026-03-10-22-02__metrics.json",
        "OE_number__vertex__gemini3flashpreview__2026-03-10-21-17__metrics.json",
        "OE_number__vertex__gemini3flashpreview__2026-02-27-02-12__metrics.json",
        "OE_number__vertex__gemini3flashpreview__2026-02-23-16-58__metrics.json"
      ],
      "timestamps": [
        "2026-03-10T23:24:01.576132Z",
        "2026-03-10T21:35:33.887375Z",
        "2026-03-10T21:02:30.244324Z",
        "2026-03-10T20:18:02.213187Z",
        "2026-02-27T01:12:08.626686Z",
        "2026-02-23T15:58:09.118614Z"
      ],
      "accuracies": [
        0.9783333333333334,
        0.9783333333333334,
        0.9775,
        0.955,
        0.9791666666666666,
        0.9783333333333334
      ]
    },
    {
      "group_id": "e771e84742134b17",
      "task_fingerprint": "0eb7fe892594e382024c3813de8ec46058d12d33d835017ca9df11d4d238d61e",
      "normalized_tag_key": "*-ing*;adverbial;clause;english;syntax",
      "task_name_display": "ADV *-ing* clause",
      "task_names_seen": [
        "ADV *-ing* clause"
      ],
      "tags_display": "*-ing*; syntax; adverbial; clause; English",
      "provider": "requesty",
      "model": "claude-sonnet-4-6",
      "run_count": 2,
      "alpha_nominal": 0.9662981040608294,
      "pairable_item_count": 305,
      "rated_item_count": 310,
      "fully_shared_item_count": 305,
      "category_count": 3,
      "run_stems": [
        "adv-ing__requesty__anthropicclaudesonnet46__2026-03-11-16-11",
        "adv-ing__requesty__anthropicclaudesonnet46__2026-03-11-16-06"
      ],
      "metrics_files": [
        "adv-ing__requesty__anthropicclaudesonnet46__2026-03-11-16-11__metrics.json",
        "adv-ing__requesty__anthropicclaudesonnet46__2026-03-11-16-06__metrics.json"
      ],
      "timestamps": [
        "2026-03-11T15:11:34.763226Z",
        "2026-03-11T15:06:20.109974Z"
      ],
      "accuracies": [
        0.9308176100628931,
        0.9056603773584906
      ]
    },
    {
      "group_id": "9d01f5b7f24ac4bc",
      "task_fingerprint": "cd60855ae86dc7a8391248627a68bbc5a77fabf2d75d0b967118c09aec3d8faa",
      "normalized_tag_key": "*like*;discourse;english;pragmatics;v3",
      "task_name_display": "*like* discourse/pragm",
      "task_names_seen": [
        "*like* discourse/pragm"
      ],
      "tags_display": "*like*; discourse; pragmatics; English; v3",
      "provider": "vertex",
      "model": "gemini-3-flash-preview",
      "run_count": 2,
      "alpha_nominal": 0.9636692421502548,
      "pairable_item_count": 111,
      "rated_item_count": 112,
      "fully_shared_item_count": 111,
      "category_count": 4,
      "run_stems": [
        "like_interrater__vertex__gemini3flashpreview__2026-03-16-23-22",
        "like_interrater__vertex__gemini3flashpreview__2026-03-16-20-22"
      ],
      "metrics_files": [
        "like_interrater__vertex__gemini3flashpreview__2026-03-16-23-22__metrics.json",
        "like_interrater__vertex__gemini3flashpreview__2026-03-16-20-22__metrics.json"
      ],
      "timestamps": [
        "2026-03-16T22:22:38.058348Z",
        "2026-03-16T19:22:29.614294Z"
      ],
      "accuracies": [
        0.9130434782608695,
        0.9217391304347826
      ]
    },
    {
      "group_id": "d2ad6ced0669cd8e",
      "task_fingerprint": "0eb7fe892594e382024c3813de8ec46058d12d33d835017ca9df11d4d238d61e",
      "normalized_tag_key": "*-ing*;adverbial;clause;english;syntax",
      "task_name_display": "ADV *-ing* clause",
      "task_names_seen": [
        "ADV *-ing* clause"
      ],
      "tags_display": "*-ing*; syntax; adverbial; clause; English",
      "provider": "e-infra",
      "model": "qwen3.5",
      "run_count": 2,
      "alpha_nominal": 0.9621555534656259,
      "pairable_item_count": 252,
      "rated_item_count": 281,
      "fully_shared_item_count": 252,
      "category_count": 3,
      "run_stems": [
        "adv-ing__einfra__qwen35__2026-03-20-15-45",
        "adv-ing__einfra__qwen35__2026-03-12-14-37"
      ],
      "metrics_files": [
        "adv-ing__einfra__qwen35__2026-03-20-15-45__metrics.json",
        "adv-ing__einfra__qwen35__2026-03-12-14-37__metrics.json"
      ],
      "timestamps": [
        "2026-03-20T14:45:23.934528Z",
        "2026-03-12T13:37:29.843039Z"
      ],
      "accuracies": [
        0.8113207547169812,
        0.8238993710691824
      ]
    },
    {
      "group_id": "0816f7ef27ef6768",
      "task_fingerprint": "0eb7fe892594e382024c3813de8ec46058d12d33d835017ca9df11d4d238d61e",
      "normalized_tag_key": "*-ing*;adverbial;clause;english;syntax",
      "task_name_display": "ADV *-ing* clause",
      "task_names_seen": [
        "ADV *-ing* clause"
      ],
      "tags_display": "*-ing*; syntax; adverbial; clause; English",
      "provider": "openai",
      "model": "gpt-oss-120b",
      "run_count": 2,
      "alpha_nominal": 0.9359091972010865,
      "pairable_item_count": 255,
      "rated_item_count": 309,
      "fully_shared_item_count": 255,
      "category_count": 3,
      "run_stems": [
        "adv-ing____gptoss120b__2025-11-08-23-28",
        "adv-ing____gptoss120b__2025-11-08-17-25"
      ],
      "metrics_files": [
        "adv-ing____gptoss120b__2025-11-08-23-28__metrics.json",
        "adv-ing____gptoss120b__2025-11-08-17-25__metrics.json"
      ],
      "timestamps": [
        "2025-11-08T22:28:31.410308Z",
        "2025-11-08T16:25:41.584007Z"
      ],
      "accuracies": [
        0.9308176100628931,
        0.7924528301886793
      ]
    },
    {
      "group_id": "c231858746135469",
      "task_fingerprint": "7326c5be7cb61f6e9585d417047cb29a4d0b0ca63102404c9692e99eeae036b2",
      "normalized_tag_key": "*like*;discourse;english;pragmatics;v1",
      "task_name_display": "*like* discourse/pragm",
      "task_names_seen": [
        "*like* discourse/pragm"
      ],
      "tags_display": "*like*; discourse; pragmatics; English; v1",
      "provider": "requesty",
      "model": "claude-sonnet-4-6",
      "run_count": 2,
      "alpha_nominal": 0.9294218670133443,
      "pairable_item_count": 991,
      "rated_item_count": 991,
      "fully_shared_item_count": 991,
      "category_count": 4,
      "run_stems": [
        "like__requesty__anthropicclaudesonnet46__2026-03-12-15-43",
        "like__requesty__anthropicclaudesonnet46__2026-03-12-14-40"
      ],
      "metrics_files": [
        "like__requesty__anthropicclaudesonnet46__2026-03-12-15-43__metrics.json",
        "like__requesty__anthropicclaudesonnet46__2026-03-12-14-40__metrics.json"
      ],
      "timestamps": [
        "2026-03-12T14:43:10.303101Z",
        "2026-03-12T13:40:58.748047Z"
      ],
      "accuracies": [
        0.8228228228228228,
        0.8298298298298298
      ]
    },
    {
      "group_id": "7f82d6ed0511901d",
      "task_fingerprint": "7326c5be7cb61f6e9585d417047cb29a4d0b0ca63102404c9692e99eeae036b2",
      "normalized_tag_key": "*like*;discourse;english;pragmatics;v1",
      "task_name_display": "*like* discourse/pragm",
      "task_names_seen": [
        "*like* discourse/pragm"
      ],
      "tags_display": "*like*; discourse; pragmatics; English; v1",
      "provider": "requesty",
      "model": "deepseek-v3.2",
      "run_count": 2,
      "alpha_nominal": 0.4980467875030239,
      "pairable_item_count": 995,
      "rated_item_count": 998,
      "fully_shared_item_count": 995,
      "category_count": 6,
      "run_stems": [
        "like__requesty__novitadeepseekdeepseekv32__2026-03-12-17-27",
        "like__requesty__novitadeepseekdeepseekv32__2026-03-12-16-52"
      ],
      "metrics_files": [
        "like__requesty__novitadeepseekdeepseekv32__2026-03-12-17-27__metrics.json",
        "like__requesty__novitadeepseekdeepseekv32__2026-03-12-16-52__metrics.json"
      ],
      "timestamps": [
        "2026-03-12T16:27:57.675261Z",
        "2026-03-12T15:52:04.876466Z"
      ],
      "accuracies": [
        0.6296296296296297,
        0.6446446446446447
      ]
    }
  ],
  "cross_model": {
    "latest": [
      {
        "group_id": "34dd61f01b51cdcd",
        "representative_policy": "latest",
        "task_fingerprint": "eca5b779052ec243fcc76cdc818bae503645042de8cdc27356517265352b7c24",
        "normalized_tag_key": "lemmatization;morphology;old english;v4;validator",
        "task_name_display": "OE lemmatization",
        "task_names_seen": [
          "OE lemmatization"
        ],
        "tags_display": "Old English; lemmatization; morphology; v4; validator",
        "model_count": 3,
        "alpha_nominal": 0.9792224814894694,
        "pairable_item_count": 2846,
        "rated_item_count": 2846,
        "fully_shared_item_count": 2831,
        "category_count": 540,
        "representative_run_stems": [
          "ycoe3__einfra__glm51__2026-04-27-22-51",
          "ycoe3__einfra__kimik26__2026-04-29-01-25",
          "ycoe3__vertex__gemini3flashpreview__2026-04-30-21-56"
        ],
        "representatives": [
          {
            "provider": "e-infra",
            "model": "glm-5.1",
            "run_stem": "ycoe3__einfra__glm51__2026-04-27-22-51",
            "metrics_file": "ycoe3__einfra__glm51__2026-04-27-22-51__metrics.json",
            "timestamp": "2026-04-27T20:51:24.742584Z",
            "accuracy": 0.9739985945186226,
            "cohen_kappa": 0.9735352590737089
          },
          {
            "provider": "e-infra",
            "model": "kimi-k2.6",
            "run_stem": "ycoe3__einfra__kimik26__2026-04-29-01-25",
            "metrics_file": "ycoe3__einfra__kimik26__2026-04-29-01-25__metrics.json",
            "timestamp": "2026-04-28T23:25:53.664337Z",
            "accuracy": 0.9739985945186226,
            "cohen_kappa": 0.973533010784025
          },
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "ycoe3__vertex__gemini3flashpreview__2026-04-30-21-56",
            "metrics_file": "ycoe3__vertex__gemini3flashpreview__2026-04-30-21-56__metrics.json",
            "timestamp": "2026-04-30T19:56:09.797765Z",
            "accuracy": 0.9873506676036542,
            "cohen_kappa": 0.9871231511429811
          }
        ]
      },
      {
        "group_id": "46e4f495cf8a7116",
        "representative_policy": "latest",
        "task_fingerprint": "2c0d314be63ddff932b8c577820159b00f71219bdb5eba31e8cbc0736a1243f5",
        "normalized_tag_key": "english;pragmatics;semantics;sentiment analysis",
        "task_name_display": "sentiment analysis",
        "task_names_seen": [
          "sentiment analysis"
        ],
        "tags_display": "sentiment analysis; English; semantics; pragmatics",
        "model_count": 2,
        "alpha_nominal": 0.9649421567580135,
        "pairable_item_count": 399,
        "rated_item_count": 400,
        "fully_shared_item_count": 399,
        "category_count": 2,
        "representative_run_stems": [
          "sentiment__vertex__gemini3flashpreview__2026-04-30-00-04",
          "sentiment__vertex__gemini31flashlitepreview__2026-04-29-23-18"
        ],
        "representatives": [
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "sentiment__vertex__gemini3flashpreview__2026-04-30-00-04",
            "metrics_file": "sentiment__vertex__gemini3flashpreview__2026-04-30-00-04__metrics.json",
            "timestamp": "2026-04-29T22:04:02.485410Z",
            "accuracy": 0.9425,
            "cohen_kappa": 0.885
          },
          {
            "provider": "vertex",
            "model": "gemini-3.1-flash-lite-preview",
            "run_stem": "sentiment__vertex__gemini31flashlitepreview__2026-04-29-23-18",
            "metrics_file": "sentiment__vertex__gemini31flashlitepreview__2026-04-29-23-18__metrics.json",
            "timestamp": "2026-04-29T21:18:08.340230Z",
            "accuracy": 0.9475,
            "cohen_kappa": 0.8952618453865336
          }
        ]
      },
      {
        "group_id": "c0a90aa5acad9608",
        "representative_policy": "latest",
        "task_fingerprint": "167cd427ec20c9daf6197b22d9658a19f92d953c7e59d7b9d1c98a7c6a3fb3f6",
        "normalized_tag_key": "disambiguation;homonymy;middle english;semantics",
        "task_name_display": "ME disambiguation",
        "task_names_seen": [
          "ME disambiguation"
        ],
        "tags_display": "Middle English; semantics; disambiguation; homonymy",
        "model_count": 11,
        "alpha_nominal": 0.9580819548074404,
        "pairable_item_count": 400,
        "rated_item_count": 400,
        "fully_shared_item_count": 399,
        "category_count": 5,
        "representative_run_stems": [
          "ME_disambiguation__einfra__deepseekv32thinking__2026-03-29-18-13",
          "ME_disambiguation__einfra__glm5__2026-03-29-21-59",
          "ME_disambiguation__einfra__gptoss120b__2026-03-29-18-48",
          "ME_disambiguation__einfra__kimik25__2026-03-29-19-32",
          "ME_disambiguation__einfra__qwen35__2026-03-29-20-02",
          "ME_disambiguation__openai__gpt54__2026-03-29-18-03",
          "ME_disambiguation__openai__gpt54mini__2026-03-29-18-07",
          "ME_disambiguation__openrouter__qwenqwen36plusfree__2026-04-03-19-16",
          "ME_disambiguation__requesty__anthropicclaudehaiku45__2026-03-29-18-08",
          "ME_disambiguation__vertex__gemini31flashlitepreview__2026-03-29-18-01",
          "ME_disambiguation__vertex__gemini31propreview__2026-03-29-17-50"
        ],
        "representatives": [
          {
            "provider": "e-infra",
            "model": "deepseek-v3.2-thinking",
            "run_stem": "ME_disambiguation__einfra__deepseekv32thinking__2026-03-29-18-13",
            "metrics_file": "ME_disambiguation__einfra__deepseekv32thinking__2026-03-29-18-13__metrics.json",
            "timestamp": "2026-03-29T16:13:50.205096Z",
            "accuracy": 0.99,
            "cohen_kappa": 0.9864222674813306
          },
          {
            "provider": "e-infra",
            "model": "glm-5",
            "run_stem": "ME_disambiguation__einfra__glm5__2026-03-29-21-59",
            "metrics_file": "ME_disambiguation__einfra__glm5__2026-03-29-21-59__metrics.json",
            "timestamp": "2026-03-29T19:59:20.519323Z",
            "accuracy": 0.98,
            "cohen_kappa": 0.9728539798610463
          },
          {
            "provider": "e-infra",
            "model": "gpt-oss-120b",
            "run_stem": "ME_disambiguation__einfra__gptoss120b__2026-03-29-18-48",
            "metrics_file": "ME_disambiguation__einfra__gptoss120b__2026-03-29-18-48__metrics.json",
            "timestamp": "2026-03-29T16:48:51.685631Z",
            "accuracy": 0.945,
            "cohen_kappa": 0.9254767792418956
          },
          {
            "provider": "e-infra",
            "model": "kimi-k2.5",
            "run_stem": "ME_disambiguation__einfra__kimik25__2026-03-29-19-32",
            "metrics_file": "ME_disambiguation__einfra__kimik25__2026-03-29-19-32__metrics.json",
            "timestamp": "2026-03-29T17:32:26.587695Z",
            "accuracy": 0.99,
            "cohen_kappa": 0.9863975651641643
          },
          {
            "provider": "e-infra",
            "model": "qwen3.5",
            "run_stem": "ME_disambiguation__einfra__qwen35__2026-03-29-20-02",
            "metrics_file": "ME_disambiguation__einfra__qwen35__2026-03-29-20-02__metrics.json",
            "timestamp": "2026-03-29T18:02:10.650671Z",
            "accuracy": 0.9925,
            "cohen_kappa": 0.9897961786689115
          },
          {
            "provider": "openai",
            "model": "gpt-5.4",
            "run_stem": "ME_disambiguation__openai__gpt54__2026-03-29-18-03",
            "metrics_file": "ME_disambiguation__openai__gpt54__2026-03-29-18-03__metrics.json",
            "timestamp": "2026-03-29T16:03:41.707544Z",
            "accuracy": 0.9825,
            "cohen_kappa": 0.9761712267563084
          },
          {
            "provider": "openai",
            "model": "gpt-5.4-mini",
            "run_stem": "ME_disambiguation__openai__gpt54mini__2026-03-29-18-07",
            "metrics_file": "ME_disambiguation__openai__gpt54mini__2026-03-29-18-07__metrics.json",
            "timestamp": "2026-03-29T16:07:03.773591Z",
            "accuracy": 0.9525,
            "cohen_kappa": 0.9356319502672121
          },
          {
            "provider": "openrouter",
            "model": "qwen3.6",
            "run_stem": "ME_disambiguation__openrouter__qwenqwen36plusfree__2026-04-03-19-16",
            "metrics_file": "ME_disambiguation__openrouter__qwenqwen36plusfree__2026-04-03-19-16__metrics.json",
            "timestamp": "2026-04-03T17:16:36.765533Z",
            "accuracy": 0.99,
            "cohen_kappa": 0.986392706490679
          },
          {
            "provider": "requesty",
            "model": "claude-haiku-4-5",
            "run_stem": "ME_disambiguation__requesty__anthropicclaudehaiku45__2026-03-29-18-08",
            "metrics_file": "ME_disambiguation__requesty__anthropicclaudehaiku45__2026-03-29-18-08__metrics.json",
            "timestamp": "2026-03-29T16:09:02.369257Z",
            "accuracy": 0.975,
            "cohen_kappa": 0.9660804233163169
          },
          {
            "provider": "vertex",
            "model": "gemini-3.1-flash-lite-preview",
            "run_stem": "ME_disambiguation__vertex__gemini31flashlitepreview__2026-03-29-18-01",
            "metrics_file": "ME_disambiguation__vertex__gemini31flashlitepreview__2026-03-29-18-01__metrics.json",
            "timestamp": "2026-03-29T16:01:28.629386Z",
            "accuracy": 0.9925,
            "cohen_kappa": 0.9897796666467941
          },
          {
            "provider": "vertex",
            "model": "gemini-3.1-pro-preview",
            "run_stem": "ME_disambiguation__vertex__gemini31propreview__2026-03-29-17-50",
            "metrics_file": "ME_disambiguation__vertex__gemini31propreview__2026-03-29-17-50__metrics.json",
            "timestamp": "2026-03-29T15:50:03.914973Z",
            "accuracy": 0.9925,
            "cohen_kappa": 0.989791317515547
          }
        ]
      },
      {
        "group_id": "d8684efc77ade9e8",
        "representative_policy": "latest",
        "task_fingerprint": "3d5a97ff902050847f35f1deb9e78aa16664eb5a88253b16a5d12e0d0981ef3d",
        "normalized_tag_key": "correction;english;error;preposition",
        "task_name_display": "Err. correct. prepositions",
        "task_names_seen": [
          "Err. correct. prepositions"
        ],
        "tags_display": "error; correction; preposition; English",
        "model_count": 5,
        "alpha_nominal": 0.8911470744931334,
        "pairable_item_count": 1283,
        "rated_item_count": 1283,
        "fully_shared_item_count": 1283,
        "category_count": 13,
        "representative_run_stems": [
          "prepositions__einfra__deepseekv32thinking__2026-04-02-13-14",
          "prepositions__einfra__kimik25__2026-04-01-13-10",
          "prepositions__einfra__qwen35__2026-04-04-12-31",
          "prepositions__openrouter__qwenqwen36plusfree__2026-04-03-02-19",
          "prepositions__vertex__gemini3flashpreview__2026-04-01-12-27"
        ],
        "representatives": [
          {
            "provider": "e-infra",
            "model": "deepseek-v3.2-thinking",
            "run_stem": "prepositions__einfra__deepseekv32thinking__2026-04-02-13-14",
            "metrics_file": "prepositions__einfra__deepseekv32thinking__2026-04-02-13-14__metrics.json",
            "timestamp": "2026-04-02T11:14:24.260967Z",
            "accuracy": 0.8924395946999221,
            "cohen_kappa": 0.7595338824791694
          },
          {
            "provider": "e-infra",
            "model": "kimi-k2.5",
            "run_stem": "prepositions__einfra__kimik25__2026-04-01-13-10",
            "metrics_file": "prepositions__einfra__kimik25__2026-04-01-13-10__metrics.json",
            "timestamp": "2026-04-01T11:10:43.580812Z",
            "accuracy": 0.9041309431021044,
            "cohen_kappa": 0.7827007734474937
          },
          {
            "provider": "e-infra",
            "model": "qwen3.5",
            "run_stem": "prepositions__einfra__qwen35__2026-04-04-12-31",
            "metrics_file": "prepositions__einfra__qwen35__2026-04-04-12-31__metrics.json",
            "timestamp": "2026-04-04T10:31:49.198333Z",
            "accuracy": 0.9033515198752923,
            "cohen_kappa": 0.7803725157791852
          },
          {
            "provider": "openrouter",
            "model": "qwen3.6",
            "run_stem": "prepositions__openrouter__qwenqwen36plusfree__2026-04-03-02-19",
            "metrics_file": "prepositions__openrouter__qwenqwen36plusfree__2026-04-03-02-19__metrics.json",
            "timestamp": "2026-04-03T00:19:27.374136Z",
            "accuracy": 0.9033515198752923,
            "cohen_kappa": 0.7755667190041489
          },
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "prepositions__vertex__gemini3flashpreview__2026-04-01-12-27",
            "metrics_file": "prepositions__vertex__gemini3flashpreview__2026-04-01-12-27__metrics.json",
            "timestamp": "2026-04-01T10:27:52.038487Z",
            "accuracy": 0.9025720966484801,
            "cohen_kappa": 0.7752016001861465
          }
        ]
      },
      {
        "group_id": "bbb4c11dc6f1f20a",
        "representative_policy": "latest",
        "task_fingerprint": "0eb7fe892594e382024c3813de8ec46058d12d33d835017ca9df11d4d238d61e",
        "normalized_tag_key": "*-ing*;adverbial;clause;english;syntax",
        "task_name_display": "ADV *-ing* clause",
        "task_names_seen": [
          "ADV *-ing* clause"
        ],
        "tags_display": "*-ing*; syntax; adverbial; clause; English",
        "model_count": 16,
        "alpha_nominal": 0.8775809649113769,
        "pairable_item_count": 318,
        "rated_item_count": 318,
        "fully_shared_item_count": 144,
        "category_count": 6,
        "representative_run_stems": [
          "adv-ing__einfra__deepseekv32thinking__2026-03-12-01-08",
          "adv-ing__einfra__glm47__2026-03-17-13-00",
          "adv-ing__einfra__glm5__2026-03-30-21-43",
          "adv-ing__einfra__kimik25__2026-03-19-15-11",
          "adv-ing__einfra__qwen35__2026-03-20-15-45",
          "adv-ing__google__modelsgemini3flashpreview__2026-03-11-15-40",
          "adv-ing__vertex__gemini31propreview__2026-03-11-10-43",
          "adv-ing__google__modelsgemma426ba4bit__2026-04-05-01-59",
          "adv-ing__openai__gpt54__2026-03-11-16-26",
          "adv-ing__openai__gpt54mini__2026-03-18-16-49",
          "adv-ing__openai__gpt54pro__2026-03-12-00-22",
          "adv-ing____gptoss120b__2025-11-08-23-28",
          "adv-ing____gpt5__2025-11-09-00-09",
          "adv-ing__openrouter__qwenqwen36plusfree__2026-04-03-14-47",
          "adv-ing__requesty__anthropicclaudehaiku45__2026-03-23-00-13",
          "adv-ing__requesty__anthropicclaudesonnet46__2026-03-11-16-11"
        ],
        "representatives": [
          {
            "provider": "e-infra",
            "model": "deepseek-v3.2-thinking",
            "run_stem": "adv-ing__einfra__deepseekv32thinking__2026-03-12-01-08",
            "metrics_file": "adv-ing__einfra__deepseekv32thinking__2026-03-12-01-08__metrics.json",
            "timestamp": "2026-03-12T10:15:45.563016Z",
            "accuracy": 0.9559748427672956,
            "cohen_kappa": 0.9071687726760916
          },
          {
            "provider": "e-infra",
            "model": "glm-4.7",
            "run_stem": "adv-ing__einfra__glm47__2026-03-17-13-00",
            "metrics_file": "adv-ing__einfra__glm47__2026-03-17-13-00__metrics.json",
            "timestamp": "2026-03-17T12:00:44.101281Z",
            "accuracy": 0.940251572327044,
            "cohen_kappa": 0.8767894286063869
          },
          {
            "provider": "e-infra",
            "model": "glm-5",
            "run_stem": "adv-ing__einfra__glm5__2026-03-30-21-43",
            "metrics_file": "adv-ing__einfra__glm5__2026-03-30-21-43__metrics.json",
            "timestamp": "2026-03-30T19:44:02.491601Z",
            "accuracy": 0.949685534591195,
            "cohen_kappa": 0.895925380461463
          },
          {
            "provider": "e-infra",
            "model": "kimi-k2.5",
            "run_stem": "adv-ing__einfra__kimik25__2026-03-19-15-11",
            "metrics_file": "adv-ing__einfra__kimik25__2026-03-19-15-11__metrics.json",
            "timestamp": "2026-03-19T14:11:22.566473Z",
            "accuracy": 0.9716981132075472,
            "cohen_kappa": 0.9404382843229069
          },
          {
            "provider": "e-infra",
            "model": "qwen3.5",
            "run_stem": "adv-ing__einfra__qwen35__2026-03-20-15-45",
            "metrics_file": "adv-ing__einfra__qwen35__2026-03-20-15-45__metrics.json",
            "timestamp": "2026-03-20T14:45:23.934528Z",
            "accuracy": 0.8113207547169812,
            "cohen_kappa": 0.6568283602223062
          },
          {
            "provider": "google",
            "model": "gemini-3-flash-preview",
            "run_stem": "adv-ing__google__modelsgemini3flashpreview__2026-03-11-15-40",
            "metrics_file": "adv-ing__google__modelsgemini3flashpreview__2026-03-11-15-40__metrics.json",
            "timestamp": "2026-03-11T14:40:24.117684Z",
            "accuracy": 0.9811320754716981,
            "cohen_kappa": 0.9596241747079736
          },
          {
            "provider": "google",
            "model": "gemini-3.1-pro-preview",
            "run_stem": "adv-ing__vertex__gemini31propreview__2026-03-11-10-43",
            "metrics_file": "adv-ing__vertex__gemini31propreview__2026-03-11-10-43__metrics.json",
            "timestamp": "2026-03-11T09:43:27.624333Z",
            "accuracy": 0.9748427672955975,
            "cohen_kappa": 0.9461655662772981
          },
          {
            "provider": "google",
            "model": "gemma-4-26b-a4b-it",
            "run_stem": "adv-ing__google__modelsgemma426ba4bit__2026-04-05-01-59",
            "metrics_file": "adv-ing__google__modelsgemma426ba4bit__2026-04-05-01-59__metrics.json",
            "timestamp": "2026-04-04T23:59:13.642882Z",
            "accuracy": 0.9559748427672956,
            "cohen_kappa": 0.9069825749028457
          },
          {
            "provider": "openai",
            "model": "gpt-5.4",
            "run_stem": "adv-ing__openai__gpt54__2026-03-11-16-26",
            "metrics_file": "adv-ing__openai__gpt54__2026-03-11-16-26__metrics.json",
            "timestamp": "2026-03-11T15:26:11.802357Z",
            "accuracy": 0.9182389937106918,
            "cohen_kappa": 0.8348513902205178
          },
          {
            "provider": "openai",
            "model": "gpt-5.4-mini",
            "run_stem": "adv-ing__openai__gpt54mini__2026-03-18-16-49",
            "metrics_file": "adv-ing__openai__gpt54mini__2026-03-18-16-49__metrics.json",
            "timestamp": "2026-03-18T15:49:58.142001Z",
            "accuracy": 0.7704402515723271,
            "cohen_kappa": 0.6185170577795307
          },
          {
            "provider": "openai",
            "model": "gpt-5.4-pro",
            "run_stem": "adv-ing__openai__gpt54pro__2026-03-12-00-22",
            "metrics_file": "adv-ing__openai__gpt54pro__2026-03-12-00-22__metrics.json",
            "timestamp": "2026-03-11T23:22:39.487962Z",
            "accuracy": 0.8459119496855346,
            "cohen_kappa": 0.7156776877600176
          },
          {
            "provider": "openai",
            "model": "gpt-oss-120b",
            "run_stem": "adv-ing____gptoss120b__2025-11-08-23-28",
            "metrics_file": "adv-ing____gptoss120b__2025-11-08-23-28__metrics.json",
            "timestamp": "2025-11-08T22:28:31.410308Z",
            "accuracy": 0.9308176100628931,
            "cohen_kappa": 0.857107843137255
          },
          {
            "provider": "openai",
            "model": "gpt5",
            "run_stem": "adv-ing____gpt5__2025-11-09-00-09",
            "metrics_file": "adv-ing____gpt5__2025-11-09-00-09__metrics.json",
            "timestamp": "2025-11-08T23:09:42.353960Z",
            "accuracy": 0.9025157232704403,
            "cohen_kappa": 0.8061013748746091
          },
          {
            "provider": "openrouter",
            "model": "qwen3.6",
            "run_stem": "adv-ing__openrouter__qwenqwen36plusfree__2026-04-03-14-47",
            "metrics_file": "adv-ing__openrouter__qwenqwen36plusfree__2026-04-03-14-47__metrics.json",
            "timestamp": "2026-04-03T12:47:18.635821Z",
            "accuracy": 0.9654088050314465,
            "cohen_kappa": 0.9257655822244859
          },
          {
            "provider": "requesty",
            "model": "claude-haiku-4-5",
            "run_stem": "adv-ing__requesty__anthropicclaudehaiku45__2026-03-23-00-13",
            "metrics_file": "adv-ing__requesty__anthropicclaudehaiku45__2026-03-23-00-13__metrics.json",
            "timestamp": "2026-03-22T23:13:14.236246Z",
            "accuracy": 0.4748427672955975,
            "cohen_kappa": 0.28195351478521885
          },
          {
            "provider": "requesty",
            "model": "claude-sonnet-4-6",
            "run_stem": "adv-ing__requesty__anthropicclaudesonnet46__2026-03-11-16-11",
            "metrics_file": "adv-ing__requesty__anthropicclaudesonnet46__2026-03-11-16-11__metrics.json",
            "timestamp": "2026-03-11T15:11:34.763226Z",
            "accuracy": 0.9308176100628931,
            "cohen_kappa": 0.8585638039786512
          }
        ]
      },
      {
        "group_id": "59ce725da15f2d39",
        "representative_policy": "latest",
        "task_fingerprint": "5d2775386b7429a15d795e64be76f5d0829aff4b3b67fc8acae71e4adebccffd",
        "normalized_tag_key": "morphology;number;old english",
        "task_name_display": "OE number",
        "task_names_seen": [
          "OE number"
        ],
        "tags_display": "Old English; morphology; number",
        "model_count": 19,
        "alpha_nominal": 0.8411162419519214,
        "pairable_item_count": 1200,
        "rated_item_count": 1200,
        "fully_shared_item_count": 1097,
        "category_count": 8,
        "representative_run_stems": [
          "OE_number____deepseekv32thinking__2026-02-21-02-26",
          "OE_number____gemini3flashpreview__2026-02-25-01-10",
          "OE_number____gemini31propreview__2026-02-21-02-26",
          "OE_number____glm47__2026-02-22-16-17",
          "OE_number____gptoss120b__2026-02-21-18-49",
          "OE_number____gpt51__2026-02-24-17-09",
          "OE_number____gpt52pro__2026-02-21-02-26",
          "OE_number____gpt5mini__2026-02-24-01-18",
          "OE_number____kimik25__2026-02-21-23-36",
          "OE_number__einfra__glm5__2026-04-02-01-04",
          "OE_number__inception__mercury2__2026-03-04-23-43",
          "OE_number__openai__gpt54mini__2026-03-20-18-21",
          "OE_number__openai__gpt54pro__2026-03-10-15-22",
          "OE_number__openai__qwen35__2026-02-23-02-19",
          "OE_number__openrouter__qwenqwen36plusfree__2026-04-04-00-50",
          "OE_number__requesty__anthropicclaudehaiku45__2026-03-23-01-17",
          "OE_number__requesty__claudesonnet46__2026-02-21-20-17",
          "OE_number__openai__gpt54__2026-03-10-14-52",
          "OE_number__vertex__gemini3flashpreview__2026-03-11-00-23"
        ],
        "representatives": [
          {
            "provider": "",
            "model": "deepseek-v3.2-thinking",
            "run_stem": "OE_number____deepseekv32thinking__2026-02-21-02-26",
            "metrics_file": "OE_number____deepseekv32thinking__2026-02-21-02-26__metrics.json",
            "timestamp": "2026-02-21T01:26:38.907718Z",
            "accuracy": 0.9633333333333334,
            "cohen_kappa": 0.8979509004673386
          },
          {
            "provider": "",
            "model": "gemini-3-flash-preview",
            "run_stem": "OE_number____gemini3flashpreview__2026-02-25-01-10",
            "metrics_file": "OE_number____gemini3flashpreview__2026-02-25-01-10__metrics.json",
            "timestamp": "2026-02-25T00:10:24.527172Z",
            "accuracy": 0.9808333333333333,
            "cohen_kappa": 0.9449738825312014
          },
          {
            "provider": "",
            "model": "gemini-3.1-pro-preview",
            "run_stem": "OE_number____gemini31propreview__2026-02-21-02-26",
            "metrics_file": "OE_number____gemini31propreview__2026-02-21-02-26__metrics.json",
            "timestamp": "2026-02-21T01:26:31.217004Z",
            "accuracy": 0.9825,
            "cohen_kappa": 0.949889040017181
          },
          {
            "provider": "",
            "model": "glm-4.7",
            "run_stem": "OE_number____glm47__2026-02-22-16-17",
            "metrics_file": "OE_number____glm47__2026-02-22-16-17__metrics.json",
            "timestamp": "2026-02-22T15:17:02.084278Z",
            "accuracy": 0.9516666666666667,
            "cohen_kappa": 0.8702485039428794
          },
          {
            "provider": "",
            "model": "gpt-oss-120b",
            "run_stem": "OE_number____gptoss120b__2026-02-21-18-49",
            "metrics_file": "OE_number____gptoss120b__2026-02-21-18-49__metrics.json",
            "timestamp": "2026-02-21T17:49:34.964163Z",
            "accuracy": 0.8766666666666667,
            "cohen_kappa": 0.6989514152334654
          },
          {
            "provider": "",
            "model": "gpt51",
            "run_stem": "OE_number____gpt51__2026-02-24-17-09",
            "metrics_file": "OE_number____gpt51__2026-02-24-17-09__metrics.json",
            "timestamp": "2026-02-24T16:09:02.122628Z",
            "accuracy": 0.9308333333333333,
            "cohen_kappa": 0.8217653816833507
          },
          {
            "provider": "",
            "model": "gpt52pro",
            "run_stem": "OE_number____gpt52pro__2026-02-21-02-26",
            "metrics_file": "OE_number____gpt52pro__2026-02-21-02-26__metrics.json",
            "timestamp": "2026-02-21T01:26:20.586712Z",
            "accuracy": 0.9775,
            "cohen_kappa": 0.9357382563646894
          },
          {
            "provider": "",
            "model": "gpt5mini",
            "run_stem": "OE_number____gpt5mini__2026-02-24-01-18",
            "metrics_file": "OE_number____gpt5mini__2026-02-24-01-18__metrics.json",
            "timestamp": "2026-02-24T00:18:27.101111Z",
            "accuracy": 0.9175,
            "cohen_kappa": 0.7903386348715559
          },
          {
            "provider": "",
            "model": "kimi-k2.5",
            "run_stem": "OE_number____kimik25__2026-02-21-23-36",
            "metrics_file": "OE_number____kimik25__2026-02-21-23-36__metrics.json",
            "timestamp": "2026-02-21T22:37:01.106140Z",
            "accuracy": 0.9666666666666667,
            "cohen_kappa": 0.9075472279577182
          },
          {
            "provider": "e-infra",
            "model": "glm-5",
            "run_stem": "OE_number__einfra__glm5__2026-04-02-01-04",
            "metrics_file": "OE_number__einfra__glm5__2026-04-02-01-04__metrics.json",
            "timestamp": "2026-04-01T23:04:56.944335Z",
            "accuracy": 0.9641666666666666,
            "cohen_kappa": 0.8995307541034676
          },
          {
            "provider": "inception",
            "model": "mercury-2",
            "run_stem": "OE_number__inception__mercury2__2026-03-04-23-43",
            "metrics_file": "OE_number__inception__mercury2__2026-03-04-23-43__metrics.json",
            "timestamp": "2026-03-04T22:49:36.410032Z",
            "accuracy": 0.8308333333333333,
            "cohen_kappa": 0.6136240352496692
          },
          {
            "provider": "openai",
            "model": "gpt-5.4-mini",
            "run_stem": "OE_number__openai__gpt54mini__2026-03-20-18-21",
            "metrics_file": "OE_number__openai__gpt54mini__2026-03-20-18-21__metrics.json",
            "timestamp": "2026-03-20T17:21:04.759848Z",
            "accuracy": 0.9375,
            "cohen_kappa": 0.8335491030146107
          },
          {
            "provider": "openai",
            "model": "gpt-5.4-pro",
            "run_stem": "OE_number__openai__gpt54pro__2026-03-10-15-22",
            "metrics_file": "OE_number__openai__gpt54pro__2026-03-10-15-22__metrics.json",
            "timestamp": "2026-03-10T14:22:37.561603Z",
            "accuracy": 0.9791666666666666,
            "cohen_kappa": 0.9400331017278462
          },
          {
            "provider": "openai",
            "model": "qwen3.5",
            "run_stem": "OE_number__openai__qwen35__2026-02-23-02-19",
            "metrics_file": "OE_number__openai__qwen35__2026-02-23-02-19__metrics.json",
            "timestamp": "2026-02-23T01:19:56.706842Z",
            "accuracy": 0.9708333333333333,
            "cohen_kappa": 0.9179706725623618
          },
          {
            "provider": "openrouter",
            "model": "qwen3.6",
            "run_stem": "OE_number__openrouter__qwenqwen36plusfree__2026-04-04-00-50",
            "metrics_file": "OE_number__openrouter__qwenqwen36plusfree__2026-04-04-00-50__metrics.json",
            "timestamp": "2026-04-03T22:50:47.255105Z",
            "accuracy": 0.96,
            "cohen_kappa": 0.8909735156498432
          },
          {
            "provider": "requesty",
            "model": "claude-haiku-4-5",
            "run_stem": "OE_number__requesty__anthropicclaudehaiku45__2026-03-23-01-17",
            "metrics_file": "OE_number__requesty__anthropicclaudehaiku45__2026-03-23-01-17__metrics.json",
            "timestamp": "2026-03-23T00:17:54.157008Z",
            "accuracy": 0.9475,
            "cohen_kappa": 0.84400406084667
          },
          {
            "provider": "requesty",
            "model": "claude-sonnet-4-6",
            "run_stem": "OE_number__requesty__claudesonnet46__2026-02-21-20-17",
            "metrics_file": "OE_number__requesty__claudesonnet46__2026-02-21-20-17__metrics.json",
            "timestamp": "2026-02-21T19:17:16.089500Z",
            "accuracy": 0.97,
            "cohen_kappa": 0.9159480745883012
          },
          {
            "provider": "requesty",
            "model": "gpt-5.4-pro",
            "run_stem": "OE_number__openai__gpt54__2026-03-10-14-52",
            "metrics_file": "OE_number__openai__gpt54__2026-03-10-14-52__metrics.json",
            "timestamp": "2026-03-10T13:52:04.694015Z",
            "accuracy": 0.9491666666666667,
            "cohen_kappa": 0.8620278884612341
          },
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "OE_number__vertex__gemini3flashpreview__2026-03-11-00-23",
            "metrics_file": "OE_number__vertex__gemini3flashpreview__2026-03-11-00-23__metrics.json",
            "timestamp": "2026-03-10T23:24:01.576132Z",
            "accuracy": 0.9783333333333334,
            "cohen_kappa": 0.937877315662881
          }
        ]
      },
      {
        "group_id": "55424975a19f10e2",
        "representative_policy": "latest",
        "task_fingerprint": "7326c5be7cb61f6e9585d417047cb29a4d0b0ca63102404c9692e99eeae036b2",
        "normalized_tag_key": "*like*;discourse;english;pragmatics;v2",
        "task_name_display": "*like* discourse/pragm",
        "task_names_seen": [
          "*like* discourse/pragm"
        ],
        "tags_display": "*like*; discourse; pragmatics; English; v2",
        "model_count": 4,
        "alpha_nominal": 0.8315709379878387,
        "pairable_item_count": 990,
        "rated_item_count": 993,
        "fully_shared_item_count": 984,
        "category_count": 4,
        "representative_run_stems": [
          "like__vertex__gemini3flashpreview__2026-03-13-02-32",
          "like__vertex__gemini3propreview__2026-03-13-03-28",
          "like__vertex__gemini31flashlitepreview__2026-03-13-03-29",
          "like__vertex__gemini31propreview__2026-03-13-12-02"
        ],
        "representatives": [
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "like__vertex__gemini3flashpreview__2026-03-13-02-32",
            "metrics_file": "like__vertex__gemini3flashpreview__2026-03-13-02-32__metrics.json",
            "timestamp": "2026-03-13T01:32:26.042657Z",
            "accuracy": 0.908908908908909,
            "cohen_kappa": 0.8684070726005488
          },
          {
            "provider": "vertex",
            "model": "gemini-3-pro-preview",
            "run_stem": "like__vertex__gemini3propreview__2026-03-13-03-28",
            "metrics_file": "like__vertex__gemini3propreview__2026-03-13-03-28__metrics.json",
            "timestamp": "2026-03-13T02:29:02.578184Z",
            "accuracy": 0.9019019019019019,
            "cohen_kappa": 0.8586077115714009
          },
          {
            "provider": "vertex",
            "model": "gemini-3.1-flash-lite-preview",
            "run_stem": "like__vertex__gemini31flashlitepreview__2026-03-13-03-29",
            "metrics_file": "like__vertex__gemini31flashlitepreview__2026-03-13-03-29__metrics.json",
            "timestamp": "2026-03-13T02:29:25.545069Z",
            "accuracy": 0.7877877877877878,
            "cohen_kappa": 0.6900339840354066
          },
          {
            "provider": "vertex",
            "model": "gemini-3.1-pro-preview",
            "run_stem": "like__vertex__gemini31propreview__2026-03-13-12-02",
            "metrics_file": "like__vertex__gemini31propreview__2026-03-13-12-02__metrics.json",
            "timestamp": "2026-03-13T11:02:34.724184Z",
            "accuracy": 0.914914914914915,
            "cohen_kappa": 0.876989156966848
          }
        ]
      },
      {
        "group_id": "bc91b92d621b9f2f",
        "representative_policy": "latest",
        "task_fingerprint": "e0860bb13f8109f7854720ca263054673c978e0e327536fcf1d0e1226f2388db",
        "normalized_tag_key": "lemmatization;morphology;old english;v3",
        "task_name_display": "OE lemmatization",
        "task_names_seen": [
          "OE lemmatization"
        ],
        "tags_display": "Old English; lemmatization; morphology; v3",
        "model_count": 8,
        "alpha_nominal": 0.7831469880735121,
        "pairable_item_count": 2844,
        "rated_item_count": 2846,
        "fully_shared_item_count": 2819,
        "category_count": 1237,
        "representative_run_stems": [
          "ycoe3__einfra__deepseekv32thinking__2026-03-18-23-50",
          "ycoe3__einfra__glm47__2026-03-20-10-02",
          "ycoe3__einfra__glm5__2026-03-31-20-38",
          "ycoe3__einfra__gptoss120b__2026-03-20-15-32",
          "ycoe3__einfra__kimik25__2026-03-19-21-22",
          "ycoe3__openai__gpt54mini__2026-03-18-17-04",
          "ycoe3__requesty__anthropicclaudehaiku45__2026-03-23-01-29",
          "ycoe3__vertex__gemini3flashpreview__2026-03-18-01-11"
        ],
        "representatives": [
          {
            "provider": "e-infra",
            "model": "deepseek-v3.2-thinking",
            "run_stem": "ycoe3__einfra__deepseekv32thinking__2026-03-18-23-50",
            "metrics_file": "ycoe3__einfra__deepseekv32thinking__2026-03-18-23-50__metrics.json",
            "timestamp": "2026-03-18T22:50:33.957474Z",
            "accuracy": 0.809961144471918,
            "cohen_kappa": 0.8067742518245767
          },
          {
            "provider": "e-infra",
            "model": "glm-4.7",
            "run_stem": "ycoe3__einfra__glm47__2026-03-20-10-02",
            "metrics_file": "ycoe3__einfra__glm47__2026-03-20-10-02__metrics.json",
            "timestamp": "2026-03-20T09:02:57.831985Z",
            "accuracy": 0.8212645708230307,
            "cohen_kappa": 0.8182681322919435
          },
          {
            "provider": "e-infra",
            "model": "glm-5",
            "run_stem": "ycoe3__einfra__glm5__2026-03-31-20-38",
            "metrics_file": "ycoe3__einfra__glm5__2026-03-31-20-38__metrics.json",
            "timestamp": "2026-03-31T18:38:48.544821Z",
            "accuracy": 0.8205581066760862,
            "cohen_kappa": 0.8175208636693767
          },
          {
            "provider": "e-infra",
            "model": "gpt-oss-120b",
            "run_stem": "ycoe3__einfra__gptoss120b__2026-03-20-15-32",
            "metrics_file": "ycoe3__einfra__gptoss120b__2026-03-20-15-32__metrics.json",
            "timestamp": "2026-03-20T14:32:56.669315Z",
            "accuracy": 0.7004592016955139,
            "cohen_kappa": 0.6961615957069811
          },
          {
            "provider": "e-infra",
            "model": "kimi-k2.5",
            "run_stem": "ycoe3__einfra__kimik25__2026-03-19-21-22",
            "metrics_file": "ycoe3__einfra__kimik25__2026-03-19-21-22__metrics.json",
            "timestamp": "2026-03-19T20:22:24.560641Z",
            "accuracy": 0.8265630519251148,
            "cohen_kappa": 0.8235506107595435
          },
          {
            "provider": "openai",
            "model": "gpt-5.4-mini",
            "run_stem": "ycoe3__openai__gpt54mini__2026-03-18-17-04",
            "metrics_file": "ycoe3__openai__gpt54mini__2026-03-18-17-04__metrics.json",
            "timestamp": "2026-03-18T16:04:56.311644Z",
            "accuracy": 0.6944542564464854,
            "cohen_kappa": 0.6896664555799817
          },
          {
            "provider": "requesty",
            "model": "claude-haiku-4-5",
            "run_stem": "ycoe3__requesty__anthropicclaudehaiku45__2026-03-23-01-29",
            "metrics_file": "ycoe3__requesty__anthropicclaudehaiku45__2026-03-23-01-29__metrics.json",
            "timestamp": "2026-03-23T00:29:26.469275Z",
            "accuracy": 0.6937477922995408,
            "cohen_kappa": 0.6901560734270713
          },
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "ycoe3__vertex__gemini3flashpreview__2026-03-18-01-11",
            "metrics_file": "ycoe3__vertex__gemini3flashpreview__2026-03-18-01-11__metrics.json",
            "timestamp": "2026-03-18T00:11:38.185791Z",
            "accuracy": 0.8322147651006712,
            "cohen_kappa": 0.8292449559819026
          }
        ]
      },
      {
        "group_id": "cc0c9725606971a1",
        "representative_policy": "latest",
        "task_fingerprint": "9911e304116a3387591af8bb222b9d5c5983c625188825c9007ceff21f5910bc",
        "normalized_tag_key": "*like*;discourse;english;pragmatics;v3",
        "task_name_display": "*like* discourse/pragm",
        "task_names_seen": [
          "*like* discourse/pragm"
        ],
        "tags_display": "*like*; discourse; pragmatics; English; v3",
        "model_count": 2,
        "alpha_nominal": 0.7537922987164527,
        "pairable_item_count": 106,
        "rated_item_count": 111,
        "fully_shared_item_count": 106,
        "category_count": 4,
        "representative_run_stems": [
          "like_interrater__einfra__deepseekv32thinking__2026-03-13-20-58",
          "like_interrater__vertex__gemini31propreview"
        ],
        "representatives": [
          {
            "provider": "e-infra",
            "model": "deepseek-v3.2-thinking",
            "run_stem": "like_interrater__einfra__deepseekv32thinking__2026-03-13-20-58",
            "metrics_file": "like_interrater__einfra__deepseekv32thinking__2026-03-13-20-58__metrics.json",
            "timestamp": "2026-03-13T19:58:49.743838Z",
            "accuracy": 0.7130434782608696,
            "cohen_kappa": 0.6246290801186943
          },
          {
            "provider": "vertex",
            "model": "gemini-3.1-pro-preview",
            "run_stem": "like_interrater__vertex__gemini31propreview",
            "metrics_file": "like_interrater__vertex__gemini31propreview__metrics.json",
            "timestamp": "2026-03-13T17:50:24.766352Z",
            "accuracy": 0.8260869565217391,
            "cohen_kappa": 0.7721644378405151
          }
        ]
      },
      {
        "group_id": "47ee6582d2d2f0c4",
        "representative_policy": "latest",
        "task_fingerprint": "cd60855ae86dc7a8391248627a68bbc5a77fabf2d75d0b967118c09aec3d8faa",
        "normalized_tag_key": "*like*;discourse;english;pragmatics;v3",
        "task_name_display": "*like* discourse/pragm",
        "task_names_seen": [
          "*like* discourse/pragm"
        ],
        "tags_display": "*like*; discourse; pragmatics; English; v3",
        "model_count": 17,
        "alpha_nominal": 0.7249902898309141,
        "pairable_item_count": 115,
        "rated_item_count": 115,
        "fully_shared_item_count": 94,
        "category_count": 14,
        "representative_run_stems": [
          "like_interrater__einfra__deepseekv32thinking__2026-03-16-23-13",
          "like_interrater__einfra__glm47__2026-03-17-00-08",
          "like_interrater__einfra__glm5__2026-03-31-17-53",
          "like_interrater__einfra__glm51__2026-04-24-16-04",
          "like_interrater__einfra__kimik25__2026-03-19-14-20",
          "like_interrater__einfra__kimik26__2026-04-21-18-06",
          "like_interrater__einfra__qwen35__2026-03-16-23-35",
          "like_interrater__google__modelsgemma426ba4bit__2026-04-04-01-49",
          "like_interrater__openai__gpt54__2026-03-16-23-16",
          "like_interrater__openai__gpt54mini__2026-03-20-18-10",
          "like_interrater__openrouter__qwenqwen36plusfree__2026-04-03-16-35",
          "like_interrater__requesty__anthropicclaudehaiku45__2026-03-23-00-11",
          "like_interrater__requesty__anthropicclaudeopus46__2026-03-21-02-24",
          "like_interrater__requesty__nebiuszaiorgglm47__2026-03-17-15-44",
          "like_interrater__requesty__moonshotkimik25__2026-03-17-15-43",
          "like_interrater__vertex__gemini3flashpreview__2026-03-16-23-22",
          "like_interrater__vertex__gemini31propreview__2026-03-16-23-17"
        ],
        "representatives": [
          {
            "provider": "e-infra",
            "model": "deepseek-v3.2-thinking",
            "run_stem": "like_interrater__einfra__deepseekv32thinking__2026-03-16-23-13",
            "metrics_file": "like_interrater__einfra__deepseekv32thinking__2026-03-16-23-13__metrics.json",
            "timestamp": "2026-03-16T22:13:59.292606Z",
            "accuracy": 0.7565217391304347,
            "cohen_kappa": 0.6858536585365853
          },
          {
            "provider": "e-infra",
            "model": "glm-4.7",
            "run_stem": "like_interrater__einfra__glm47__2026-03-17-00-08",
            "metrics_file": "like_interrater__einfra__glm47__2026-03-17-00-08__metrics.json",
            "timestamp": "2026-03-16T23:08:13.298000Z",
            "accuracy": 0.8260869565217391,
            "cohen_kappa": 0.7717121588089332
          },
          {
            "provider": "e-infra",
            "model": "glm-5",
            "run_stem": "like_interrater__einfra__glm5__2026-03-31-17-53",
            "metrics_file": "like_interrater__einfra__glm5__2026-03-31-17-53__metrics.json",
            "timestamp": "2026-03-31T15:53:31.126778Z",
            "accuracy": 0.8869565217391304,
            "cohen_kappa": 0.8496732026143791
          },
          {
            "provider": "e-infra",
            "model": "glm-5.1",
            "run_stem": "like_interrater__einfra__glm51__2026-04-24-16-04",
            "metrics_file": "like_interrater__einfra__glm51__2026-04-24-16-04__metrics.json",
            "timestamp": "2026-04-24T14:04:41.439134Z",
            "accuracy": 0.8782608695652174,
            "cohen_kappa": 0.8386773547094188
          },
          {
            "provider": "e-infra",
            "model": "kimi-k2.5",
            "run_stem": "like_interrater__einfra__kimik25__2026-03-19-14-20",
            "metrics_file": "like_interrater__einfra__kimik25__2026-03-19-14-20__metrics.json",
            "timestamp": "2026-03-19T13:20:12.743909Z",
            "accuracy": 0.8434782608695652,
            "cohen_kappa": 0.7933100349475787
          },
          {
            "provider": "e-infra",
            "model": "kimi-k2.6",
            "run_stem": "like_interrater__einfra__kimik26__2026-04-21-18-06",
            "metrics_file": "like_interrater__einfra__kimik26__2026-04-21-18-06__metrics.json",
            "timestamp": "2026-04-25T10:26:44.480602Z",
            "accuracy": 0.808695652173913,
            "cohen_kappa": 0.7486338797814208
          },
          {
            "provider": "e-infra",
            "model": "qwen3.5",
            "run_stem": "like_interrater__einfra__qwen35__2026-03-16-23-35",
            "metrics_file": "like_interrater__einfra__qwen35__2026-03-16-23-35__metrics.json",
            "timestamp": "2026-03-16T22:35:09.822527Z",
            "accuracy": 0.8956521739130435,
            "cohen_kappa": 0.8637037037037038
          },
          {
            "provider": "google",
            "model": "gemma-4-26b-a4b-it",
            "run_stem": "like_interrater__google__modelsgemma426ba4bit__2026-04-04-01-49",
            "metrics_file": "like_interrater__google__modelsgemma426ba4bit__2026-04-04-01-49__metrics.json",
            "timestamp": "2026-04-03T23:49:56.734262Z",
            "accuracy": 0.8173913043478261,
            "cohen_kappa": 0.7577733199598797
          },
          {
            "provider": "openai",
            "model": "gpt-5.4",
            "run_stem": "like_interrater__openai__gpt54__2026-03-16-23-16",
            "metrics_file": "like_interrater__openai__gpt54__2026-03-16-23-16__metrics.json",
            "timestamp": "2026-03-16T22:16:19.942954Z",
            "accuracy": 0.6869565217391305,
            "cohen_kappa": 0.6090651558073654
          },
          {
            "provider": "openai",
            "model": "gpt-5.4-mini",
            "run_stem": "like_interrater__openai__gpt54mini__2026-03-20-18-10",
            "metrics_file": "like_interrater__openai__gpt54mini__2026-03-20-18-10__metrics.json",
            "timestamp": "2026-03-20T17:11:01.726403Z",
            "accuracy": 0.6086956521739131,
            "cohen_kappa": 0.48964497041420124
          },
          {
            "provider": "openrouter",
            "model": "qwen3.6",
            "run_stem": "like_interrater__openrouter__qwenqwen36plusfree__2026-04-03-16-35",
            "metrics_file": "like_interrater__openrouter__qwenqwen36plusfree__2026-04-03-16-35__metrics.json",
            "timestamp": "2026-04-03T14:35:02.774817Z",
            "accuracy": 0.8869565217391304,
            "cohen_kappa": 0.8521266073194856
          },
          {
            "provider": "requesty",
            "model": "claude-haiku-4-5",
            "run_stem": "like_interrater__requesty__anthropicclaudehaiku45__2026-03-23-00-11",
            "metrics_file": "like_interrater__requesty__anthropicclaudehaiku45__2026-03-23-00-11__metrics.json",
            "timestamp": "2026-03-22T23:11:59.214488Z",
            "accuracy": 0.5478260869565217,
            "cohen_kappa": 0.40909090909090906
          },
          {
            "provider": "requesty",
            "model": "claude-opus-4-6",
            "run_stem": "like_interrater__requesty__anthropicclaudeopus46__2026-03-21-02-24",
            "metrics_file": "like_interrater__requesty__anthropicclaudeopus46__2026-03-21-02-24__metrics.json",
            "timestamp": "2026-03-21T01:24:28.930846Z",
            "accuracy": 0.8173913043478261,
            "cohen_kappa": 0.7521806054386866
          },
          {
            "provider": "requesty",
            "model": "glm-4.7",
            "run_stem": "like_interrater__requesty__nebiuszaiorgglm47__2026-03-17-15-44",
            "metrics_file": "like_interrater__requesty__nebiuszaiorgglm47__2026-03-17-15-44__metrics.json",
            "timestamp": "2026-03-17T14:44:34.806095Z",
            "accuracy": 0.8,
            "cohen_kappa": 0.7374689826302729
          },
          {
            "provider": "requesty",
            "model": "kimi-k2.5",
            "run_stem": "like_interrater__requesty__moonshotkimik25__2026-03-17-15-43",
            "metrics_file": "like_interrater__requesty__moonshotkimik25__2026-03-17-15-43__metrics.json",
            "timestamp": "2026-03-17T14:43:35.190186Z",
            "accuracy": 0.8608695652173913,
            "cohen_kappa": 0.817279046673287
          },
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "like_interrater__vertex__gemini3flashpreview__2026-03-16-23-22",
            "metrics_file": "like_interrater__vertex__gemini3flashpreview__2026-03-16-23-22__metrics.json",
            "timestamp": "2026-03-16T22:22:38.058348Z",
            "accuracy": 0.9130434782608695,
            "cohen_kappa": 0.8854581673306772
          },
          {
            "provider": "vertex",
            "model": "gemini-3.1-pro-preview",
            "run_stem": "like_interrater__vertex__gemini31propreview__2026-03-16-23-17",
            "metrics_file": "like_interrater__vertex__gemini31propreview__2026-03-16-23-17__metrics.json",
            "timestamp": "2026-03-16T22:17:19.752371Z",
            "accuracy": 0.9130434782608695,
            "cohen_kappa": 0.8857426726279185
          }
        ]
      },
      {
        "group_id": "4c9b09f197f45ed4",
        "representative_policy": "latest",
        "task_fingerprint": "7326c5be7cb61f6e9585d417047cb29a4d0b0ca63102404c9692e99eeae036b2",
        "normalized_tag_key": "*like*;discourse;english;pragmatics;v1",
        "task_name_display": "*like* discourse/pragm",
        "task_names_seen": [
          "*like* discourse/pragm"
        ],
        "tags_display": "*like*; discourse; pragmatics; English; v1",
        "model_count": 6,
        "alpha_nominal": 0.6641252530628283,
        "pairable_item_count": 991,
        "rated_item_count": 995,
        "fully_shared_item_count": 842,
        "category_count": 5,
        "representative_run_stems": [
          "like____gptoss120b__2025-11-09-17-09",
          "like____gpt5__2025-11-09-02-03",
          "like__requesty__anthropicclaudesonnet46__2026-03-12-15-43",
          "like__requesty__novitadeepseekdeepseekv32__2026-03-12-17-27",
          "like__vertex__gemini3flashpreview__2026-03-12-01-31",
          "like__vertex__gemini31propreview__2026-03-12-18-10"
        ],
        "representatives": [
          {
            "provider": "openai",
            "model": "gpt-oss-120b",
            "run_stem": "like____gptoss120b__2025-11-09-17-09",
            "metrics_file": "like____gptoss120b__2025-11-09-17-09__metrics.json",
            "timestamp": "2025-11-09T16:09:03.246413Z",
            "accuracy": 0.6416416416416416,
            "cohen_kappa": 0.5149610698897534
          },
          {
            "provider": "openai",
            "model": "gpt5",
            "run_stem": "like____gpt5__2025-11-09-02-03",
            "metrics_file": "like____gpt5__2025-11-09-02-03__metrics.json",
            "timestamp": "2025-11-09T01:03:38.395018Z",
            "accuracy": 0.8398398398398398,
            "cohen_kappa": 0.7709513457898124
          },
          {
            "provider": "requesty",
            "model": "claude-sonnet-4-6",
            "run_stem": "like__requesty__anthropicclaudesonnet46__2026-03-12-15-43",
            "metrics_file": "like__requesty__anthropicclaudesonnet46__2026-03-12-15-43__metrics.json",
            "timestamp": "2026-03-12T14:43:10.303101Z",
            "accuracy": 0.8228228228228228,
            "cohen_kappa": 0.7478161408052575
          },
          {
            "provider": "requesty",
            "model": "deepseek-v3.2",
            "run_stem": "like__requesty__novitadeepseekdeepseekv32__2026-03-12-17-27",
            "metrics_file": "like__requesty__novitadeepseekdeepseekv32__2026-03-12-17-27__metrics.json",
            "timestamp": "2026-03-12T16:27:57.675261Z",
            "accuracy": 0.6296296296296297,
            "cohen_kappa": 0.46022553031165947
          },
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "like__vertex__gemini3flashpreview__2026-03-12-01-31",
            "metrics_file": "like__vertex__gemini3flashpreview__2026-03-12-01-31__metrics.json",
            "timestamp": "2026-03-12T00:31:13.838640Z",
            "accuracy": 0.8838838838838838,
            "cohen_kappa": 0.832653167320839
          },
          {
            "provider": "vertex",
            "model": "gemini-3.1-pro-preview",
            "run_stem": "like__vertex__gemini31propreview__2026-03-12-18-10",
            "metrics_file": "like__vertex__gemini31propreview__2026-03-12-18-10__metrics.json",
            "timestamp": "2026-03-12T17:10:58.353313Z",
            "accuracy": 0.8948948948948949,
            "cohen_kappa": 0.8484215728492612
          }
        ]
      },
      {
        "group_id": "bcd86cbea72bcdfe",
        "representative_policy": "latest",
        "task_fingerprint": "9d6ea446c173cf044401c04094f095a25fb7ceec9caef163c35be7264a0bb349",
        "normalized_tag_key": "*like*;discourse;english;pragmatics;v3",
        "task_name_display": "*like* discourse/pragm",
        "task_names_seen": [
          "*like* discourse/pragm"
        ],
        "tags_display": "*like*; discourse; pragmatics; English; v3",
        "model_count": 3,
        "alpha_nominal": 0.6518785370596594,
        "pairable_item_count": 112,
        "rated_item_count": 112,
        "fully_shared_item_count": 98,
        "category_count": 9,
        "representative_run_stems": [
          "like_interrater__openai__gpt54__2026-03-13-23-51",
          "like_interrater__requesty__anthropicclaudesonnet46__2026-03-13-22-58",
          "like_interrater__vertex__gemini3flashpreview__2026-03-13-15-32"
        ],
        "representatives": [
          {
            "provider": "openai",
            "model": "gpt-5.4",
            "run_stem": "like_interrater__openai__gpt54__2026-03-13-23-51",
            "metrics_file": "like_interrater__openai__gpt54__2026-03-13-23-51__metrics.json",
            "timestamp": "2026-03-13T22:51:21.706190Z",
            "accuracy": 0.5217391304347826,
            "cohen_kappa": 0.4387755102040816
          },
          {
            "provider": "requesty",
            "model": "claude-sonnet-4-6",
            "run_stem": "like_interrater__requesty__anthropicclaudesonnet46__2026-03-13-22-58",
            "metrics_file": "like_interrater__requesty__anthropicclaudesonnet46__2026-03-13-22-58__metrics.json",
            "timestamp": "2026-03-13T21:58:23.211617Z",
            "accuracy": 0.7478260869565218,
            "cohen_kappa": 0.6706172839506173
          },
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "like_interrater__vertex__gemini3flashpreview__2026-03-13-15-32",
            "metrics_file": "like_interrater__vertex__gemini3flashpreview__2026-03-13-15-32__metrics.json",
            "timestamp": "2026-03-13T14:32:09.952389Z",
            "accuracy": 0.808695652173913,
            "cohen_kappa": 0.7482587064676617
          }
        ]
      }
    ],
    "best_accuracy": [
      {
        "group_id": "0b905fcaf17fc900",
        "representative_policy": "best_accuracy",
        "task_fingerprint": "eca5b779052ec243fcc76cdc818bae503645042de8cdc27356517265352b7c24",
        "normalized_tag_key": "lemmatization;morphology;old english;v4;validator",
        "task_name_display": "OE lemmatization",
        "task_names_seen": [
          "OE lemmatization"
        ],
        "tags_display": "Old English; lemmatization; morphology; v4; validator",
        "model_count": 3,
        "alpha_nominal": 0.9792224814894694,
        "pairable_item_count": 2846,
        "rated_item_count": 2846,
        "fully_shared_item_count": 2831,
        "category_count": 540,
        "representative_run_stems": [
          "ycoe3__einfra__glm51__2026-04-27-22-51",
          "ycoe3__einfra__kimik26__2026-04-29-01-25",
          "ycoe3__vertex__gemini3flashpreview__2026-04-30-21-56"
        ],
        "representatives": [
          {
            "provider": "e-infra",
            "model": "glm-5.1",
            "run_stem": "ycoe3__einfra__glm51__2026-04-27-22-51",
            "metrics_file": "ycoe3__einfra__glm51__2026-04-27-22-51__metrics.json",
            "timestamp": "2026-04-27T20:51:24.742584Z",
            "accuracy": 0.9739985945186226,
            "cohen_kappa": 0.9735352590737089
          },
          {
            "provider": "e-infra",
            "model": "kimi-k2.6",
            "run_stem": "ycoe3__einfra__kimik26__2026-04-29-01-25",
            "metrics_file": "ycoe3__einfra__kimik26__2026-04-29-01-25__metrics.json",
            "timestamp": "2026-04-28T23:25:53.664337Z",
            "accuracy": 0.9739985945186226,
            "cohen_kappa": 0.973533010784025
          },
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "ycoe3__vertex__gemini3flashpreview__2026-04-30-21-56",
            "metrics_file": "ycoe3__vertex__gemini3flashpreview__2026-04-30-21-56__metrics.json",
            "timestamp": "2026-04-30T19:56:09.797765Z",
            "accuracy": 0.9873506676036542,
            "cohen_kappa": 0.9871231511429811
          }
        ]
      },
      {
        "group_id": "a8180ff0cdc9ff74",
        "representative_policy": "best_accuracy",
        "task_fingerprint": "2c0d314be63ddff932b8c577820159b00f71219bdb5eba31e8cbc0736a1243f5",
        "normalized_tag_key": "english;pragmatics;semantics;sentiment analysis",
        "task_name_display": "sentiment analysis",
        "task_names_seen": [
          "sentiment analysis"
        ],
        "tags_display": "sentiment analysis; English; semantics; pragmatics",
        "model_count": 2,
        "alpha_nominal": 0.9649421567580135,
        "pairable_item_count": 399,
        "rated_item_count": 400,
        "fully_shared_item_count": 399,
        "category_count": 2,
        "representative_run_stems": [
          "sentiment__vertex__gemini3flashpreview__2026-04-30-00-04",
          "sentiment__vertex__gemini31flashlitepreview__2026-04-29-23-18"
        ],
        "representatives": [
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "sentiment__vertex__gemini3flashpreview__2026-04-30-00-04",
            "metrics_file": "sentiment__vertex__gemini3flashpreview__2026-04-30-00-04__metrics.json",
            "timestamp": "2026-04-29T22:04:02.485410Z",
            "accuracy": 0.9425,
            "cohen_kappa": 0.885
          },
          {
            "provider": "vertex",
            "model": "gemini-3.1-flash-lite-preview",
            "run_stem": "sentiment__vertex__gemini31flashlitepreview__2026-04-29-23-18",
            "metrics_file": "sentiment__vertex__gemini31flashlitepreview__2026-04-29-23-18__metrics.json",
            "timestamp": "2026-04-29T21:18:08.340230Z",
            "accuracy": 0.9475,
            "cohen_kappa": 0.8952618453865336
          }
        ]
      },
      {
        "group_id": "941335089239d5c3",
        "representative_policy": "best_accuracy",
        "task_fingerprint": "167cd427ec20c9daf6197b22d9658a19f92d953c7e59d7b9d1c98a7c6a3fb3f6",
        "normalized_tag_key": "disambiguation;homonymy;middle english;semantics",
        "task_name_display": "ME disambiguation",
        "task_names_seen": [
          "ME disambiguation"
        ],
        "tags_display": "Middle English; semantics; disambiguation; homonymy",
        "model_count": 11,
        "alpha_nominal": 0.9580819548074404,
        "pairable_item_count": 400,
        "rated_item_count": 400,
        "fully_shared_item_count": 399,
        "category_count": 5,
        "representative_run_stems": [
          "ME_disambiguation__einfra__deepseekv32thinking__2026-03-29-18-13",
          "ME_disambiguation__einfra__glm5__2026-03-29-21-59",
          "ME_disambiguation__einfra__gptoss120b__2026-03-29-18-48",
          "ME_disambiguation__einfra__kimik25__2026-03-29-19-32",
          "ME_disambiguation__einfra__qwen35__2026-03-29-20-02",
          "ME_disambiguation__openai__gpt54__2026-03-29-18-03",
          "ME_disambiguation__openai__gpt54mini__2026-03-29-18-07",
          "ME_disambiguation__openrouter__qwenqwen36plusfree__2026-04-03-19-16",
          "ME_disambiguation__requesty__anthropicclaudehaiku45__2026-03-29-18-08",
          "ME_disambiguation__vertex__gemini31flashlitepreview__2026-03-29-18-01",
          "ME_disambiguation__vertex__gemini31propreview__2026-03-29-17-50"
        ],
        "representatives": [
          {
            "provider": "e-infra",
            "model": "deepseek-v3.2-thinking",
            "run_stem": "ME_disambiguation__einfra__deepseekv32thinking__2026-03-29-18-13",
            "metrics_file": "ME_disambiguation__einfra__deepseekv32thinking__2026-03-29-18-13__metrics.json",
            "timestamp": "2026-03-29T16:13:50.205096Z",
            "accuracy": 0.99,
            "cohen_kappa": 0.9864222674813306
          },
          {
            "provider": "e-infra",
            "model": "glm-5",
            "run_stem": "ME_disambiguation__einfra__glm5__2026-03-29-21-59",
            "metrics_file": "ME_disambiguation__einfra__glm5__2026-03-29-21-59__metrics.json",
            "timestamp": "2026-03-29T19:59:20.519323Z",
            "accuracy": 0.98,
            "cohen_kappa": 0.9728539798610463
          },
          {
            "provider": "e-infra",
            "model": "gpt-oss-120b",
            "run_stem": "ME_disambiguation__einfra__gptoss120b__2026-03-29-18-48",
            "metrics_file": "ME_disambiguation__einfra__gptoss120b__2026-03-29-18-48__metrics.json",
            "timestamp": "2026-03-29T16:48:51.685631Z",
            "accuracy": 0.945,
            "cohen_kappa": 0.9254767792418956
          },
          {
            "provider": "e-infra",
            "model": "kimi-k2.5",
            "run_stem": "ME_disambiguation__einfra__kimik25__2026-03-29-19-32",
            "metrics_file": "ME_disambiguation__einfra__kimik25__2026-03-29-19-32__metrics.json",
            "timestamp": "2026-03-29T17:32:26.587695Z",
            "accuracy": 0.99,
            "cohen_kappa": 0.9863975651641643
          },
          {
            "provider": "e-infra",
            "model": "qwen3.5",
            "run_stem": "ME_disambiguation__einfra__qwen35__2026-03-29-20-02",
            "metrics_file": "ME_disambiguation__einfra__qwen35__2026-03-29-20-02__metrics.json",
            "timestamp": "2026-03-29T18:02:10.650671Z",
            "accuracy": 0.9925,
            "cohen_kappa": 0.9897961786689115
          },
          {
            "provider": "openai",
            "model": "gpt-5.4",
            "run_stem": "ME_disambiguation__openai__gpt54__2026-03-29-18-03",
            "metrics_file": "ME_disambiguation__openai__gpt54__2026-03-29-18-03__metrics.json",
            "timestamp": "2026-03-29T16:03:41.707544Z",
            "accuracy": 0.9825,
            "cohen_kappa": 0.9761712267563084
          },
          {
            "provider": "openai",
            "model": "gpt-5.4-mini",
            "run_stem": "ME_disambiguation__openai__gpt54mini__2026-03-29-18-07",
            "metrics_file": "ME_disambiguation__openai__gpt54mini__2026-03-29-18-07__metrics.json",
            "timestamp": "2026-03-29T16:07:03.773591Z",
            "accuracy": 0.9525,
            "cohen_kappa": 0.9356319502672121
          },
          {
            "provider": "openrouter",
            "model": "qwen3.6",
            "run_stem": "ME_disambiguation__openrouter__qwenqwen36plusfree__2026-04-03-19-16",
            "metrics_file": "ME_disambiguation__openrouter__qwenqwen36plusfree__2026-04-03-19-16__metrics.json",
            "timestamp": "2026-04-03T17:16:36.765533Z",
            "accuracy": 0.99,
            "cohen_kappa": 0.986392706490679
          },
          {
            "provider": "requesty",
            "model": "claude-haiku-4-5",
            "run_stem": "ME_disambiguation__requesty__anthropicclaudehaiku45__2026-03-29-18-08",
            "metrics_file": "ME_disambiguation__requesty__anthropicclaudehaiku45__2026-03-29-18-08__metrics.json",
            "timestamp": "2026-03-29T16:09:02.369257Z",
            "accuracy": 0.975,
            "cohen_kappa": 0.9660804233163169
          },
          {
            "provider": "vertex",
            "model": "gemini-3.1-flash-lite-preview",
            "run_stem": "ME_disambiguation__vertex__gemini31flashlitepreview__2026-03-29-18-01",
            "metrics_file": "ME_disambiguation__vertex__gemini31flashlitepreview__2026-03-29-18-01__metrics.json",
            "timestamp": "2026-03-29T16:01:28.629386Z",
            "accuracy": 0.9925,
            "cohen_kappa": 0.9897796666467941
          },
          {
            "provider": "vertex",
            "model": "gemini-3.1-pro-preview",
            "run_stem": "ME_disambiguation__vertex__gemini31propreview__2026-03-29-17-50",
            "metrics_file": "ME_disambiguation__vertex__gemini31propreview__2026-03-29-17-50__metrics.json",
            "timestamp": "2026-03-29T15:50:03.914973Z",
            "accuracy": 0.9925,
            "cohen_kappa": 0.989791317515547
          }
        ]
      },
      {
        "group_id": "532a75904e1d671d",
        "representative_policy": "best_accuracy",
        "task_fingerprint": "3d5a97ff902050847f35f1deb9e78aa16664eb5a88253b16a5d12e0d0981ef3d",
        "normalized_tag_key": "correction;english;error;preposition",
        "task_name_display": "Err. correct. prepositions",
        "task_names_seen": [
          "Err. correct. prepositions"
        ],
        "tags_display": "error; correction; preposition; English",
        "model_count": 5,
        "alpha_nominal": 0.8911470744931334,
        "pairable_item_count": 1283,
        "rated_item_count": 1283,
        "fully_shared_item_count": 1283,
        "category_count": 13,
        "representative_run_stems": [
          "prepositions__einfra__deepseekv32thinking__2026-04-02-13-14",
          "prepositions__einfra__kimik25__2026-04-01-13-10",
          "prepositions__einfra__qwen35__2026-04-04-12-31",
          "prepositions__openrouter__qwenqwen36plusfree__2026-04-03-02-19",
          "prepositions__vertex__gemini3flashpreview__2026-04-01-12-27"
        ],
        "representatives": [
          {
            "provider": "e-infra",
            "model": "deepseek-v3.2-thinking",
            "run_stem": "prepositions__einfra__deepseekv32thinking__2026-04-02-13-14",
            "metrics_file": "prepositions__einfra__deepseekv32thinking__2026-04-02-13-14__metrics.json",
            "timestamp": "2026-04-02T11:14:24.260967Z",
            "accuracy": 0.8924395946999221,
            "cohen_kappa": 0.7595338824791694
          },
          {
            "provider": "e-infra",
            "model": "kimi-k2.5",
            "run_stem": "prepositions__einfra__kimik25__2026-04-01-13-10",
            "metrics_file": "prepositions__einfra__kimik25__2026-04-01-13-10__metrics.json",
            "timestamp": "2026-04-01T11:10:43.580812Z",
            "accuracy": 0.9041309431021044,
            "cohen_kappa": 0.7827007734474937
          },
          {
            "provider": "e-infra",
            "model": "qwen3.5",
            "run_stem": "prepositions__einfra__qwen35__2026-04-04-12-31",
            "metrics_file": "prepositions__einfra__qwen35__2026-04-04-12-31__metrics.json",
            "timestamp": "2026-04-04T10:31:49.198333Z",
            "accuracy": 0.9033515198752923,
            "cohen_kappa": 0.7803725157791852
          },
          {
            "provider": "openrouter",
            "model": "qwen3.6",
            "run_stem": "prepositions__openrouter__qwenqwen36plusfree__2026-04-03-02-19",
            "metrics_file": "prepositions__openrouter__qwenqwen36plusfree__2026-04-03-02-19__metrics.json",
            "timestamp": "2026-04-03T00:19:27.374136Z",
            "accuracy": 0.9033515198752923,
            "cohen_kappa": 0.7755667190041489
          },
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "prepositions__vertex__gemini3flashpreview__2026-04-01-12-27",
            "metrics_file": "prepositions__vertex__gemini3flashpreview__2026-04-01-12-27__metrics.json",
            "timestamp": "2026-04-01T10:27:52.038487Z",
            "accuracy": 0.9025720966484801,
            "cohen_kappa": 0.7752016001861465
          }
        ]
      },
      {
        "group_id": "b9abcee6b15c5c5f",
        "representative_policy": "best_accuracy",
        "task_fingerprint": "0eb7fe892594e382024c3813de8ec46058d12d33d835017ca9df11d4d238d61e",
        "normalized_tag_key": "*-ing*;adverbial;clause;english;syntax",
        "task_name_display": "ADV *-ing* clause",
        "task_names_seen": [
          "ADV *-ing* clause"
        ],
        "tags_display": "*-ing*; syntax; adverbial; clause; English",
        "model_count": 16,
        "alpha_nominal": 0.8788765021031747,
        "pairable_item_count": 318,
        "rated_item_count": 318,
        "fully_shared_item_count": 143,
        "category_count": 6,
        "representative_run_stems": [
          "adv-ing__einfra__deepseekv32thinking__2026-03-12-01-08",
          "adv-ing__einfra__glm47__2026-03-17-13-00",
          "adv-ing__einfra__glm5__2026-03-30-21-43",
          "adv-ing__einfra__kimik25__2026-03-19-15-11",
          "adv-ing__einfra__qwen35__2026-03-12-14-37",
          "adv-ing__google__modelsgemini3flashpreview__2026-03-11-15-40",
          "adv-ing__vertex__gemini31propreview__2026-03-11-10-43",
          "adv-ing__google__modelsgemma426ba4bit__2026-04-05-01-59",
          "adv-ing__openai__gpt54__2026-03-11-16-26",
          "adv-ing__openai__gpt54mini__2026-03-18-16-49",
          "adv-ing__openai__gpt54pro__2026-03-12-00-22",
          "adv-ing____gptoss120b__2025-11-08-23-28",
          "adv-ing____gpt5__2025-11-08-17-24",
          "adv-ing__openrouter__qwenqwen36plusfree__2026-04-03-14-47",
          "adv-ing__requesty__anthropicclaudehaiku45__2026-03-23-00-13",
          "adv-ing__requesty__anthropicclaudesonnet46__2026-03-11-16-11"
        ],
        "representatives": [
          {
            "provider": "e-infra",
            "model": "deepseek-v3.2-thinking",
            "run_stem": "adv-ing__einfra__deepseekv32thinking__2026-03-12-01-08",
            "metrics_file": "adv-ing__einfra__deepseekv32thinking__2026-03-12-01-08__metrics.json",
            "timestamp": "2026-03-12T10:15:45.563016Z",
            "accuracy": 0.9559748427672956,
            "cohen_kappa": 0.9071687726760916
          },
          {
            "provider": "e-infra",
            "model": "glm-4.7",
            "run_stem": "adv-ing__einfra__glm47__2026-03-17-13-00",
            "metrics_file": "adv-ing__einfra__glm47__2026-03-17-13-00__metrics.json",
            "timestamp": "2026-03-17T12:00:44.101281Z",
            "accuracy": 0.940251572327044,
            "cohen_kappa": 0.8767894286063869
          },
          {
            "provider": "e-infra",
            "model": "glm-5",
            "run_stem": "adv-ing__einfra__glm5__2026-03-30-21-43",
            "metrics_file": "adv-ing__einfra__glm5__2026-03-30-21-43__metrics.json",
            "timestamp": "2026-03-30T19:44:02.491601Z",
            "accuracy": 0.949685534591195,
            "cohen_kappa": 0.895925380461463
          },
          {
            "provider": "e-infra",
            "model": "kimi-k2.5",
            "run_stem": "adv-ing__einfra__kimik25__2026-03-19-15-11",
            "metrics_file": "adv-ing__einfra__kimik25__2026-03-19-15-11__metrics.json",
            "timestamp": "2026-03-19T14:11:22.566473Z",
            "accuracy": 0.9716981132075472,
            "cohen_kappa": 0.9404382843229069
          },
          {
            "provider": "e-infra",
            "model": "qwen3.5",
            "run_stem": "adv-ing__einfra__qwen35__2026-03-12-14-37",
            "metrics_file": "adv-ing__einfra__qwen35__2026-03-12-14-37__metrics.json",
            "timestamp": "2026-03-12T13:37:29.843039Z",
            "accuracy": 0.8238993710691824,
            "cohen_kappa": 0.670716148000222
          },
          {
            "provider": "google",
            "model": "gemini-3-flash-preview",
            "run_stem": "adv-ing__google__modelsgemini3flashpreview__2026-03-11-15-40",
            "metrics_file": "adv-ing__google__modelsgemini3flashpreview__2026-03-11-15-40__metrics.json",
            "timestamp": "2026-03-11T14:40:24.117684Z",
            "accuracy": 0.9811320754716981,
            "cohen_kappa": 0.9596241747079736
          },
          {
            "provider": "google",
            "model": "gemini-3.1-pro-preview",
            "run_stem": "adv-ing__vertex__gemini31propreview__2026-03-11-10-43",
            "metrics_file": "adv-ing__vertex__gemini31propreview__2026-03-11-10-43__metrics.json",
            "timestamp": "2026-03-11T09:43:27.624333Z",
            "accuracy": 0.9748427672955975,
            "cohen_kappa": 0.9461655662772981
          },
          {
            "provider": "google",
            "model": "gemma-4-26b-a4b-it",
            "run_stem": "adv-ing__google__modelsgemma426ba4bit__2026-04-05-01-59",
            "metrics_file": "adv-ing__google__modelsgemma426ba4bit__2026-04-05-01-59__metrics.json",
            "timestamp": "2026-04-04T23:59:13.642882Z",
            "accuracy": 0.9559748427672956,
            "cohen_kappa": 0.9069825749028457
          },
          {
            "provider": "openai",
            "model": "gpt-5.4",
            "run_stem": "adv-ing__openai__gpt54__2026-03-11-16-26",
            "metrics_file": "adv-ing__openai__gpt54__2026-03-11-16-26__metrics.json",
            "timestamp": "2026-03-11T15:26:11.802357Z",
            "accuracy": 0.9182389937106918,
            "cohen_kappa": 0.8348513902205178
          },
          {
            "provider": "openai",
            "model": "gpt-5.4-mini",
            "run_stem": "adv-ing__openai__gpt54mini__2026-03-18-16-49",
            "metrics_file": "adv-ing__openai__gpt54mini__2026-03-18-16-49__metrics.json",
            "timestamp": "2026-03-18T15:49:58.142001Z",
            "accuracy": 0.7704402515723271,
            "cohen_kappa": 0.6185170577795307
          },
          {
            "provider": "openai",
            "model": "gpt-5.4-pro",
            "run_stem": "adv-ing__openai__gpt54pro__2026-03-12-00-22",
            "metrics_file": "adv-ing__openai__gpt54pro__2026-03-12-00-22__metrics.json",
            "timestamp": "2026-03-11T23:22:39.487962Z",
            "accuracy": 0.8459119496855346,
            "cohen_kappa": 0.7156776877600176
          },
          {
            "provider": "openai",
            "model": "gpt-oss-120b",
            "run_stem": "adv-ing____gptoss120b__2025-11-08-23-28",
            "metrics_file": "adv-ing____gptoss120b__2025-11-08-23-28__metrics.json",
            "timestamp": "2025-11-08T22:28:31.410308Z",
            "accuracy": 0.9308176100628931,
            "cohen_kappa": 0.857107843137255
          },
          {
            "provider": "openai",
            "model": "gpt5",
            "run_stem": "adv-ing____gpt5__2025-11-08-17-24",
            "metrics_file": "adv-ing____gpt5__2025-11-08-17-24__metrics.json",
            "timestamp": "2025-11-08T16:24:26.999467Z",
            "accuracy": 0.9245283018867925,
            "cohen_kappa": 0.8476190476190476
          },
          {
            "provider": "openrouter",
            "model": "qwen3.6",
            "run_stem": "adv-ing__openrouter__qwenqwen36plusfree__2026-04-03-14-47",
            "metrics_file": "adv-ing__openrouter__qwenqwen36plusfree__2026-04-03-14-47__metrics.json",
            "timestamp": "2026-04-03T12:47:18.635821Z",
            "accuracy": 0.9654088050314465,
            "cohen_kappa": 0.9257655822244859
          },
          {
            "provider": "requesty",
            "model": "claude-haiku-4-5",
            "run_stem": "adv-ing__requesty__anthropicclaudehaiku45__2026-03-23-00-13",
            "metrics_file": "adv-ing__requesty__anthropicclaudehaiku45__2026-03-23-00-13__metrics.json",
            "timestamp": "2026-03-22T23:13:14.236246Z",
            "accuracy": 0.4748427672955975,
            "cohen_kappa": 0.28195351478521885
          },
          {
            "provider": "requesty",
            "model": "claude-sonnet-4-6",
            "run_stem": "adv-ing__requesty__anthropicclaudesonnet46__2026-03-11-16-11",
            "metrics_file": "adv-ing__requesty__anthropicclaudesonnet46__2026-03-11-16-11__metrics.json",
            "timestamp": "2026-03-11T15:11:34.763226Z",
            "accuracy": 0.9308176100628931,
            "cohen_kappa": 0.8585638039786512
          }
        ]
      },
      {
        "group_id": "714c54f5e2df639d",
        "representative_policy": "best_accuracy",
        "task_fingerprint": "5d2775386b7429a15d795e64be76f5d0829aff4b3b67fc8acae71e4adebccffd",
        "normalized_tag_key": "morphology;number;old english",
        "task_name_display": "OE number",
        "task_names_seen": [
          "OE number"
        ],
        "tags_display": "Old English; morphology; number",
        "model_count": 19,
        "alpha_nominal": 0.8413996916150354,
        "pairable_item_count": 1200,
        "rated_item_count": 1200,
        "fully_shared_item_count": 1097,
        "category_count": 8,
        "representative_run_stems": [
          "OE_number____deepseekv32thinking__2026-02-21-02-26",
          "OE_number____gemini3flashpreview__2026-02-25-01-10",
          "OE_number____gemini31propreview__2026-02-21-02-26",
          "OE_number____glm47__2026-02-22-16-17",
          "OE_number____gptoss120b__2026-02-21-18-49",
          "OE_number____gpt51__2026-02-24-17-09",
          "OE_number____gpt52pro__2026-02-21-02-26",
          "OE_number____gpt5mini__2026-02-24-01-18",
          "OE_number____kimik25__2026-02-21-23-36",
          "OE_number__einfra__glm5__2026-04-02-01-04",
          "OE_number__inception__mercury2__2026-03-04-23-43",
          "OE_number__openai__gpt54mini__2026-03-20-18-21",
          "OE_number__openai__gpt54pro__2026-03-10-15-22",
          "OE_number__openai__qwen35__2026-02-23-02-19",
          "OE_number__openrouter__qwenqwen36plusfree__2026-04-04-00-50",
          "OE_number__requesty__anthropicclaudehaiku45__2026-03-23-01-17",
          "OE_number__requesty__claudesonnet46__2026-02-21-20-17",
          "OE_number__openai__gpt54__2026-03-10-14-52",
          "OE_number__vertex__gemini3flashpreview__2026-02-27-02-12"
        ],
        "representatives": [
          {
            "provider": "",
            "model": "deepseek-v3.2-thinking",
            "run_stem": "OE_number____deepseekv32thinking__2026-02-21-02-26",
            "metrics_file": "OE_number____deepseekv32thinking__2026-02-21-02-26__metrics.json",
            "timestamp": "2026-02-21T01:26:38.907718Z",
            "accuracy": 0.9633333333333334,
            "cohen_kappa": 0.8979509004673386
          },
          {
            "provider": "",
            "model": "gemini-3-flash-preview",
            "run_stem": "OE_number____gemini3flashpreview__2026-02-25-01-10",
            "metrics_file": "OE_number____gemini3flashpreview__2026-02-25-01-10__metrics.json",
            "timestamp": "2026-02-25T00:10:24.527172Z",
            "accuracy": 0.9808333333333333,
            "cohen_kappa": 0.9449738825312014
          },
          {
            "provider": "",
            "model": "gemini-3.1-pro-preview",
            "run_stem": "OE_number____gemini31propreview__2026-02-21-02-26",
            "metrics_file": "OE_number____gemini31propreview__2026-02-21-02-26__metrics.json",
            "timestamp": "2026-02-21T01:26:31.217004Z",
            "accuracy": 0.9825,
            "cohen_kappa": 0.949889040017181
          },
          {
            "provider": "",
            "model": "glm-4.7",
            "run_stem": "OE_number____glm47__2026-02-22-16-17",
            "metrics_file": "OE_number____glm47__2026-02-22-16-17__metrics.json",
            "timestamp": "2026-02-22T15:17:02.084278Z",
            "accuracy": 0.9516666666666667,
            "cohen_kappa": 0.8702485039428794
          },
          {
            "provider": "",
            "model": "gpt-oss-120b",
            "run_stem": "OE_number____gptoss120b__2026-02-21-18-49",
            "metrics_file": "OE_number____gptoss120b__2026-02-21-18-49__metrics.json",
            "timestamp": "2026-02-21T17:49:34.964163Z",
            "accuracy": 0.8766666666666667,
            "cohen_kappa": 0.6989514152334654
          },
          {
            "provider": "",
            "model": "gpt51",
            "run_stem": "OE_number____gpt51__2026-02-24-17-09",
            "metrics_file": "OE_number____gpt51__2026-02-24-17-09__metrics.json",
            "timestamp": "2026-02-24T16:09:02.122628Z",
            "accuracy": 0.9308333333333333,
            "cohen_kappa": 0.8217653816833507
          },
          {
            "provider": "",
            "model": "gpt52pro",
            "run_stem": "OE_number____gpt52pro__2026-02-21-02-26",
            "metrics_file": "OE_number____gpt52pro__2026-02-21-02-26__metrics.json",
            "timestamp": "2026-02-21T01:26:20.586712Z",
            "accuracy": 0.9775,
            "cohen_kappa": 0.9357382563646894
          },
          {
            "provider": "",
            "model": "gpt5mini",
            "run_stem": "OE_number____gpt5mini__2026-02-24-01-18",
            "metrics_file": "OE_number____gpt5mini__2026-02-24-01-18__metrics.json",
            "timestamp": "2026-02-24T00:18:27.101111Z",
            "accuracy": 0.9175,
            "cohen_kappa": 0.7903386348715559
          },
          {
            "provider": "",
            "model": "kimi-k2.5",
            "run_stem": "OE_number____kimik25__2026-02-21-23-36",
            "metrics_file": "OE_number____kimik25__2026-02-21-23-36__metrics.json",
            "timestamp": "2026-02-21T22:37:01.106140Z",
            "accuracy": 0.9666666666666667,
            "cohen_kappa": 0.9075472279577182
          },
          {
            "provider": "e-infra",
            "model": "glm-5",
            "run_stem": "OE_number__einfra__glm5__2026-04-02-01-04",
            "metrics_file": "OE_number__einfra__glm5__2026-04-02-01-04__metrics.json",
            "timestamp": "2026-04-01T23:04:56.944335Z",
            "accuracy": 0.9641666666666666,
            "cohen_kappa": 0.8995307541034676
          },
          {
            "provider": "inception",
            "model": "mercury-2",
            "run_stem": "OE_number__inception__mercury2__2026-03-04-23-43",
            "metrics_file": "OE_number__inception__mercury2__2026-03-04-23-43__metrics.json",
            "timestamp": "2026-03-04T22:49:36.410032Z",
            "accuracy": 0.8308333333333333,
            "cohen_kappa": 0.6136240352496692
          },
          {
            "provider": "openai",
            "model": "gpt-5.4-mini",
            "run_stem": "OE_number__openai__gpt54mini__2026-03-20-18-21",
            "metrics_file": "OE_number__openai__gpt54mini__2026-03-20-18-21__metrics.json",
            "timestamp": "2026-03-20T17:21:04.759848Z",
            "accuracy": 0.9375,
            "cohen_kappa": 0.8335491030146107
          },
          {
            "provider": "openai",
            "model": "gpt-5.4-pro",
            "run_stem": "OE_number__openai__gpt54pro__2026-03-10-15-22",
            "metrics_file": "OE_number__openai__gpt54pro__2026-03-10-15-22__metrics.json",
            "timestamp": "2026-03-10T14:22:37.561603Z",
            "accuracy": 0.9791666666666666,
            "cohen_kappa": 0.9400331017278462
          },
          {
            "provider": "openai",
            "model": "qwen3.5",
            "run_stem": "OE_number__openai__qwen35__2026-02-23-02-19",
            "metrics_file": "OE_number__openai__qwen35__2026-02-23-02-19__metrics.json",
            "timestamp": "2026-02-23T01:19:56.706842Z",
            "accuracy": 0.9708333333333333,
            "cohen_kappa": 0.9179706725623618
          },
          {
            "provider": "openrouter",
            "model": "qwen3.6",
            "run_stem": "OE_number__openrouter__qwenqwen36plusfree__2026-04-04-00-50",
            "metrics_file": "OE_number__openrouter__qwenqwen36plusfree__2026-04-04-00-50__metrics.json",
            "timestamp": "2026-04-03T22:50:47.255105Z",
            "accuracy": 0.96,
            "cohen_kappa": 0.8909735156498432
          },
          {
            "provider": "requesty",
            "model": "claude-haiku-4-5",
            "run_stem": "OE_number__requesty__anthropicclaudehaiku45__2026-03-23-01-17",
            "metrics_file": "OE_number__requesty__anthropicclaudehaiku45__2026-03-23-01-17__metrics.json",
            "timestamp": "2026-03-23T00:17:54.157008Z",
            "accuracy": 0.9475,
            "cohen_kappa": 0.84400406084667
          },
          {
            "provider": "requesty",
            "model": "claude-sonnet-4-6",
            "run_stem": "OE_number__requesty__claudesonnet46__2026-02-21-20-17",
            "metrics_file": "OE_number__requesty__claudesonnet46__2026-02-21-20-17__metrics.json",
            "timestamp": "2026-02-21T19:17:16.089500Z",
            "accuracy": 0.97,
            "cohen_kappa": 0.9159480745883012
          },
          {
            "provider": "requesty",
            "model": "gpt-5.4-pro",
            "run_stem": "OE_number__openai__gpt54__2026-03-10-14-52",
            "metrics_file": "OE_number__openai__gpt54__2026-03-10-14-52__metrics.json",
            "timestamp": "2026-03-10T13:52:04.694015Z",
            "accuracy": 0.9491666666666667,
            "cohen_kappa": 0.8620278884612341
          },
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "OE_number__vertex__gemini3flashpreview__2026-02-27-02-12",
            "metrics_file": "OE_number__vertex__gemini3flashpreview__2026-02-27-02-12__metrics.json",
            "timestamp": "2026-02-27T01:12:08.626686Z",
            "accuracy": 0.9791666666666666,
            "cohen_kappa": 0.9403440952585486
          }
        ]
      },
      {
        "group_id": "1247c033faa2b147",
        "representative_policy": "best_accuracy",
        "task_fingerprint": "7326c5be7cb61f6e9585d417047cb29a4d0b0ca63102404c9692e99eeae036b2",
        "normalized_tag_key": "*like*;discourse;english;pragmatics;v2",
        "task_name_display": "*like* discourse/pragm",
        "task_names_seen": [
          "*like* discourse/pragm"
        ],
        "tags_display": "*like*; discourse; pragmatics; English; v2",
        "model_count": 4,
        "alpha_nominal": 0.8315709379878387,
        "pairable_item_count": 990,
        "rated_item_count": 993,
        "fully_shared_item_count": 984,
        "category_count": 4,
        "representative_run_stems": [
          "like__vertex__gemini3flashpreview__2026-03-13-02-32",
          "like__vertex__gemini3propreview__2026-03-13-03-28",
          "like__vertex__gemini31flashlitepreview__2026-03-13-03-29",
          "like__vertex__gemini31propreview__2026-03-13-12-02"
        ],
        "representatives": [
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "like__vertex__gemini3flashpreview__2026-03-13-02-32",
            "metrics_file": "like__vertex__gemini3flashpreview__2026-03-13-02-32__metrics.json",
            "timestamp": "2026-03-13T01:32:26.042657Z",
            "accuracy": 0.908908908908909,
            "cohen_kappa": 0.8684070726005488
          },
          {
            "provider": "vertex",
            "model": "gemini-3-pro-preview",
            "run_stem": "like__vertex__gemini3propreview__2026-03-13-03-28",
            "metrics_file": "like__vertex__gemini3propreview__2026-03-13-03-28__metrics.json",
            "timestamp": "2026-03-13T02:29:02.578184Z",
            "accuracy": 0.9019019019019019,
            "cohen_kappa": 0.8586077115714009
          },
          {
            "provider": "vertex",
            "model": "gemini-3.1-flash-lite-preview",
            "run_stem": "like__vertex__gemini31flashlitepreview__2026-03-13-03-29",
            "metrics_file": "like__vertex__gemini31flashlitepreview__2026-03-13-03-29__metrics.json",
            "timestamp": "2026-03-13T02:29:25.545069Z",
            "accuracy": 0.7877877877877878,
            "cohen_kappa": 0.6900339840354066
          },
          {
            "provider": "vertex",
            "model": "gemini-3.1-pro-preview",
            "run_stem": "like__vertex__gemini31propreview__2026-03-13-12-02",
            "metrics_file": "like__vertex__gemini31propreview__2026-03-13-12-02__metrics.json",
            "timestamp": "2026-03-13T11:02:34.724184Z",
            "accuracy": 0.914914914914915,
            "cohen_kappa": 0.876989156966848
          }
        ]
      },
      {
        "group_id": "da35194c64831de2",
        "representative_policy": "best_accuracy",
        "task_fingerprint": "e0860bb13f8109f7854720ca263054673c978e0e327536fcf1d0e1226f2388db",
        "normalized_tag_key": "lemmatization;morphology;old english;v3",
        "task_name_display": "OE lemmatization",
        "task_names_seen": [
          "OE lemmatization"
        ],
        "tags_display": "Old English; lemmatization; morphology; v3",
        "model_count": 8,
        "alpha_nominal": 0.7831469880735121,
        "pairable_item_count": 2844,
        "rated_item_count": 2846,
        "fully_shared_item_count": 2819,
        "category_count": 1237,
        "representative_run_stems": [
          "ycoe3__einfra__deepseekv32thinking__2026-03-18-23-50",
          "ycoe3__einfra__glm47__2026-03-20-10-02",
          "ycoe3__einfra__glm5__2026-03-31-20-38",
          "ycoe3__einfra__gptoss120b__2026-03-20-15-32",
          "ycoe3__einfra__kimik25__2026-03-19-21-22",
          "ycoe3__openai__gpt54mini__2026-03-18-17-04",
          "ycoe3__requesty__anthropicclaudehaiku45__2026-03-23-01-29",
          "ycoe3__vertex__gemini3flashpreview__2026-03-18-01-11"
        ],
        "representatives": [
          {
            "provider": "e-infra",
            "model": "deepseek-v3.2-thinking",
            "run_stem": "ycoe3__einfra__deepseekv32thinking__2026-03-18-23-50",
            "metrics_file": "ycoe3__einfra__deepseekv32thinking__2026-03-18-23-50__metrics.json",
            "timestamp": "2026-03-18T22:50:33.957474Z",
            "accuracy": 0.809961144471918,
            "cohen_kappa": 0.8067742518245767
          },
          {
            "provider": "e-infra",
            "model": "glm-4.7",
            "run_stem": "ycoe3__einfra__glm47__2026-03-20-10-02",
            "metrics_file": "ycoe3__einfra__glm47__2026-03-20-10-02__metrics.json",
            "timestamp": "2026-03-20T09:02:57.831985Z",
            "accuracy": 0.8212645708230307,
            "cohen_kappa": 0.8182681322919435
          },
          {
            "provider": "e-infra",
            "model": "glm-5",
            "run_stem": "ycoe3__einfra__glm5__2026-03-31-20-38",
            "metrics_file": "ycoe3__einfra__glm5__2026-03-31-20-38__metrics.json",
            "timestamp": "2026-03-31T18:38:48.544821Z",
            "accuracy": 0.8205581066760862,
            "cohen_kappa": 0.8175208636693767
          },
          {
            "provider": "e-infra",
            "model": "gpt-oss-120b",
            "run_stem": "ycoe3__einfra__gptoss120b__2026-03-20-15-32",
            "metrics_file": "ycoe3__einfra__gptoss120b__2026-03-20-15-32__metrics.json",
            "timestamp": "2026-03-20T14:32:56.669315Z",
            "accuracy": 0.7004592016955139,
            "cohen_kappa": 0.6961615957069811
          },
          {
            "provider": "e-infra",
            "model": "kimi-k2.5",
            "run_stem": "ycoe3__einfra__kimik25__2026-03-19-21-22",
            "metrics_file": "ycoe3__einfra__kimik25__2026-03-19-21-22__metrics.json",
            "timestamp": "2026-03-19T20:22:24.560641Z",
            "accuracy": 0.8265630519251148,
            "cohen_kappa": 0.8235506107595435
          },
          {
            "provider": "openai",
            "model": "gpt-5.4-mini",
            "run_stem": "ycoe3__openai__gpt54mini__2026-03-18-17-04",
            "metrics_file": "ycoe3__openai__gpt54mini__2026-03-18-17-04__metrics.json",
            "timestamp": "2026-03-18T16:04:56.311644Z",
            "accuracy": 0.6944542564464854,
            "cohen_kappa": 0.6896664555799817
          },
          {
            "provider": "requesty",
            "model": "claude-haiku-4-5",
            "run_stem": "ycoe3__requesty__anthropicclaudehaiku45__2026-03-23-01-29",
            "metrics_file": "ycoe3__requesty__anthropicclaudehaiku45__2026-03-23-01-29__metrics.json",
            "timestamp": "2026-03-23T00:29:26.469275Z",
            "accuracy": 0.6937477922995408,
            "cohen_kappa": 0.6901560734270713
          },
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "ycoe3__vertex__gemini3flashpreview__2026-03-18-01-11",
            "metrics_file": "ycoe3__vertex__gemini3flashpreview__2026-03-18-01-11__metrics.json",
            "timestamp": "2026-03-18T00:11:38.185791Z",
            "accuracy": 0.8322147651006712,
            "cohen_kappa": 0.8292449559819026
          }
        ]
      },
      {
        "group_id": "411099465fb6c0e7",
        "representative_policy": "best_accuracy",
        "task_fingerprint": "9911e304116a3387591af8bb222b9d5c5983c625188825c9007ceff21f5910bc",
        "normalized_tag_key": "*like*;discourse;english;pragmatics;v3",
        "task_name_display": "*like* discourse/pragm",
        "task_names_seen": [
          "*like* discourse/pragm"
        ],
        "tags_display": "*like*; discourse; pragmatics; English; v3",
        "model_count": 2,
        "alpha_nominal": 0.7537922987164527,
        "pairable_item_count": 106,
        "rated_item_count": 111,
        "fully_shared_item_count": 106,
        "category_count": 4,
        "representative_run_stems": [
          "like_interrater__einfra__deepseekv32thinking__2026-03-13-20-58",
          "like_interrater__vertex__gemini31propreview"
        ],
        "representatives": [
          {
            "provider": "e-infra",
            "model": "deepseek-v3.2-thinking",
            "run_stem": "like_interrater__einfra__deepseekv32thinking__2026-03-13-20-58",
            "metrics_file": "like_interrater__einfra__deepseekv32thinking__2026-03-13-20-58__metrics.json",
            "timestamp": "2026-03-13T19:58:49.743838Z",
            "accuracy": 0.7130434782608696,
            "cohen_kappa": 0.6246290801186943
          },
          {
            "provider": "vertex",
            "model": "gemini-3.1-pro-preview",
            "run_stem": "like_interrater__vertex__gemini31propreview",
            "metrics_file": "like_interrater__vertex__gemini31propreview__metrics.json",
            "timestamp": "2026-03-13T17:50:24.766352Z",
            "accuracy": 0.8260869565217391,
            "cohen_kappa": 0.7721644378405151
          }
        ]
      },
      {
        "group_id": "e5971c73350ad936",
        "representative_policy": "best_accuracy",
        "task_fingerprint": "cd60855ae86dc7a8391248627a68bbc5a77fabf2d75d0b967118c09aec3d8faa",
        "normalized_tag_key": "*like*;discourse;english;pragmatics;v3",
        "task_name_display": "*like* discourse/pragm",
        "task_names_seen": [
          "*like* discourse/pragm"
        ],
        "tags_display": "*like*; discourse; pragmatics; English; v3",
        "model_count": 17,
        "alpha_nominal": 0.7249950656203237,
        "pairable_item_count": 115,
        "rated_item_count": 115,
        "fully_shared_item_count": 94,
        "category_count": 14,
        "representative_run_stems": [
          "like_interrater__einfra__deepseekv32thinking__2026-03-16-23-13",
          "like_interrater__einfra__glm47__2026-03-17-00-08",
          "like_interrater__einfra__glm5__2026-03-31-17-53",
          "like_interrater__einfra__glm51__2026-04-24-16-04",
          "like_interrater__einfra__kimik25__2026-03-19-14-20",
          "like_interrater__einfra__kimik26__2026-04-21-18-06",
          "like_interrater__einfra__qwen35__2026-03-16-23-35",
          "like_interrater__google__modelsgemma426ba4bit__2026-04-04-01-49",
          "like_interrater__openai__gpt54__2026-03-16-23-16",
          "like_interrater__openai__gpt54mini__2026-03-20-18-10",
          "like_interrater__openrouter__qwenqwen36plusfree__2026-04-03-16-35",
          "like_interrater__requesty__anthropicclaudehaiku45__2026-03-23-00-11",
          "like_interrater__requesty__anthropicclaudeopus46__2026-03-21-02-24",
          "like_interrater__requesty__nebiuszaiorgglm47__2026-03-17-15-44",
          "like_interrater__requesty__moonshotkimik25__2026-03-17-15-43",
          "like_interrater__vertex__gemini3flashpreview__2026-03-16-20-22",
          "like_interrater__vertex__gemini31propreview__2026-03-16-23-17"
        ],
        "representatives": [
          {
            "provider": "e-infra",
            "model": "deepseek-v3.2-thinking",
            "run_stem": "like_interrater__einfra__deepseekv32thinking__2026-03-16-23-13",
            "metrics_file": "like_interrater__einfra__deepseekv32thinking__2026-03-16-23-13__metrics.json",
            "timestamp": "2026-03-16T22:13:59.292606Z",
            "accuracy": 0.7565217391304347,
            "cohen_kappa": 0.6858536585365853
          },
          {
            "provider": "e-infra",
            "model": "glm-4.7",
            "run_stem": "like_interrater__einfra__glm47__2026-03-17-00-08",
            "metrics_file": "like_interrater__einfra__glm47__2026-03-17-00-08__metrics.json",
            "timestamp": "2026-03-16T23:08:13.298000Z",
            "accuracy": 0.8260869565217391,
            "cohen_kappa": 0.7717121588089332
          },
          {
            "provider": "e-infra",
            "model": "glm-5",
            "run_stem": "like_interrater__einfra__glm5__2026-03-31-17-53",
            "metrics_file": "like_interrater__einfra__glm5__2026-03-31-17-53__metrics.json",
            "timestamp": "2026-03-31T15:53:31.126778Z",
            "accuracy": 0.8869565217391304,
            "cohen_kappa": 0.8496732026143791
          },
          {
            "provider": "e-infra",
            "model": "glm-5.1",
            "run_stem": "like_interrater__einfra__glm51__2026-04-24-16-04",
            "metrics_file": "like_interrater__einfra__glm51__2026-04-24-16-04__metrics.json",
            "timestamp": "2026-04-24T14:04:41.439134Z",
            "accuracy": 0.8782608695652174,
            "cohen_kappa": 0.8386773547094188
          },
          {
            "provider": "e-infra",
            "model": "kimi-k2.5",
            "run_stem": "like_interrater__einfra__kimik25__2026-03-19-14-20",
            "metrics_file": "like_interrater__einfra__kimik25__2026-03-19-14-20__metrics.json",
            "timestamp": "2026-03-19T13:20:12.743909Z",
            "accuracy": 0.8434782608695652,
            "cohen_kappa": 0.7933100349475787
          },
          {
            "provider": "e-infra",
            "model": "kimi-k2.6",
            "run_stem": "like_interrater__einfra__kimik26__2026-04-21-18-06",
            "metrics_file": "like_interrater__einfra__kimik26__2026-04-21-18-06__metrics.json",
            "timestamp": "2026-04-25T10:26:44.480602Z",
            "accuracy": 0.808695652173913,
            "cohen_kappa": 0.7486338797814208
          },
          {
            "provider": "e-infra",
            "model": "qwen3.5",
            "run_stem": "like_interrater__einfra__qwen35__2026-03-16-23-35",
            "metrics_file": "like_interrater__einfra__qwen35__2026-03-16-23-35__metrics.json",
            "timestamp": "2026-03-16T22:35:09.822527Z",
            "accuracy": 0.8956521739130435,
            "cohen_kappa": 0.8637037037037038
          },
          {
            "provider": "google",
            "model": "gemma-4-26b-a4b-it",
            "run_stem": "like_interrater__google__modelsgemma426ba4bit__2026-04-04-01-49",
            "metrics_file": "like_interrater__google__modelsgemma426ba4bit__2026-04-04-01-49__metrics.json",
            "timestamp": "2026-04-03T23:49:56.734262Z",
            "accuracy": 0.8173913043478261,
            "cohen_kappa": 0.7577733199598797
          },
          {
            "provider": "openai",
            "model": "gpt-5.4",
            "run_stem": "like_interrater__openai__gpt54__2026-03-16-23-16",
            "metrics_file": "like_interrater__openai__gpt54__2026-03-16-23-16__metrics.json",
            "timestamp": "2026-03-16T22:16:19.942954Z",
            "accuracy": 0.6869565217391305,
            "cohen_kappa": 0.6090651558073654
          },
          {
            "provider": "openai",
            "model": "gpt-5.4-mini",
            "run_stem": "like_interrater__openai__gpt54mini__2026-03-20-18-10",
            "metrics_file": "like_interrater__openai__gpt54mini__2026-03-20-18-10__metrics.json",
            "timestamp": "2026-03-20T17:11:01.726403Z",
            "accuracy": 0.6086956521739131,
            "cohen_kappa": 0.48964497041420124
          },
          {
            "provider": "openrouter",
            "model": "qwen3.6",
            "run_stem": "like_interrater__openrouter__qwenqwen36plusfree__2026-04-03-16-35",
            "metrics_file": "like_interrater__openrouter__qwenqwen36plusfree__2026-04-03-16-35__metrics.json",
            "timestamp": "2026-04-03T14:35:02.774817Z",
            "accuracy": 0.8869565217391304,
            "cohen_kappa": 0.8521266073194856
          },
          {
            "provider": "requesty",
            "model": "claude-haiku-4-5",
            "run_stem": "like_interrater__requesty__anthropicclaudehaiku45__2026-03-23-00-11",
            "metrics_file": "like_interrater__requesty__anthropicclaudehaiku45__2026-03-23-00-11__metrics.json",
            "timestamp": "2026-03-22T23:11:59.214488Z",
            "accuracy": 0.5478260869565217,
            "cohen_kappa": 0.40909090909090906
          },
          {
            "provider": "requesty",
            "model": "claude-opus-4-6",
            "run_stem": "like_interrater__requesty__anthropicclaudeopus46__2026-03-21-02-24",
            "metrics_file": "like_interrater__requesty__anthropicclaudeopus46__2026-03-21-02-24__metrics.json",
            "timestamp": "2026-03-21T01:24:28.930846Z",
            "accuracy": 0.8173913043478261,
            "cohen_kappa": 0.7521806054386866
          },
          {
            "provider": "requesty",
            "model": "glm-4.7",
            "run_stem": "like_interrater__requesty__nebiuszaiorgglm47__2026-03-17-15-44",
            "metrics_file": "like_interrater__requesty__nebiuszaiorgglm47__2026-03-17-15-44__metrics.json",
            "timestamp": "2026-03-17T14:44:34.806095Z",
            "accuracy": 0.8,
            "cohen_kappa": 0.7374689826302729
          },
          {
            "provider": "requesty",
            "model": "kimi-k2.5",
            "run_stem": "like_interrater__requesty__moonshotkimik25__2026-03-17-15-43",
            "metrics_file": "like_interrater__requesty__moonshotkimik25__2026-03-17-15-43__metrics.json",
            "timestamp": "2026-03-17T14:43:35.190186Z",
            "accuracy": 0.8608695652173913,
            "cohen_kappa": 0.817279046673287
          },
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "like_interrater__vertex__gemini3flashpreview__2026-03-16-20-22",
            "metrics_file": "like_interrater__vertex__gemini3flashpreview__2026-03-16-20-22__metrics.json",
            "timestamp": "2026-03-16T19:22:29.614294Z",
            "accuracy": 0.9217391304347826,
            "cohen_kappa": 0.8961885656970913
          },
          {
            "provider": "vertex",
            "model": "gemini-3.1-pro-preview",
            "run_stem": "like_interrater__vertex__gemini31propreview__2026-03-16-23-17",
            "metrics_file": "like_interrater__vertex__gemini31propreview__2026-03-16-23-17__metrics.json",
            "timestamp": "2026-03-16T22:17:19.752371Z",
            "accuracy": 0.9130434782608695,
            "cohen_kappa": 0.8857426726279185
          }
        ]
      },
      {
        "group_id": "ea2931fcb792fdbc",
        "representative_policy": "best_accuracy",
        "task_fingerprint": "7326c5be7cb61f6e9585d417047cb29a4d0b0ca63102404c9692e99eeae036b2",
        "normalized_tag_key": "*like*;discourse;english;pragmatics;v1",
        "task_name_display": "*like* discourse/pragm",
        "task_names_seen": [
          "*like* discourse/pragm"
        ],
        "tags_display": "*like*; discourse; pragmatics; English; v1",
        "model_count": 6,
        "alpha_nominal": 0.6751601138214288,
        "pairable_item_count": 991,
        "rated_item_count": 998,
        "fully_shared_item_count": 842,
        "category_count": 5,
        "representative_run_stems": [
          "like____gptoss120b__2025-11-09-17-09",
          "like____gpt5__2025-11-09-02-03",
          "like__requesty__anthropicclaudesonnet46__2026-03-12-14-40",
          "like__requesty__novitadeepseekdeepseekv32__2026-03-12-16-52",
          "like__vertex__gemini3flashpreview__2026-03-12-01-31",
          "like__vertex__gemini31propreview__2026-03-12-18-10"
        ],
        "representatives": [
          {
            "provider": "openai",
            "model": "gpt-oss-120b",
            "run_stem": "like____gptoss120b__2025-11-09-17-09",
            "metrics_file": "like____gptoss120b__2025-11-09-17-09__metrics.json",
            "timestamp": "2025-11-09T16:09:03.246413Z",
            "accuracy": 0.6416416416416416,
            "cohen_kappa": 0.5149610698897534
          },
          {
            "provider": "openai",
            "model": "gpt5",
            "run_stem": "like____gpt5__2025-11-09-02-03",
            "metrics_file": "like____gpt5__2025-11-09-02-03__metrics.json",
            "timestamp": "2025-11-09T01:03:38.395018Z",
            "accuracy": 0.8398398398398398,
            "cohen_kappa": 0.7709513457898124
          },
          {
            "provider": "requesty",
            "model": "claude-sonnet-4-6",
            "run_stem": "like__requesty__anthropicclaudesonnet46__2026-03-12-14-40",
            "metrics_file": "like__requesty__anthropicclaudesonnet46__2026-03-12-14-40__metrics.json",
            "timestamp": "2026-03-12T13:40:58.748047Z",
            "accuracy": 0.8298298298298298,
            "cohen_kappa": 0.7574231404297621
          },
          {
            "provider": "requesty",
            "model": "deepseek-v3.2",
            "run_stem": "like__requesty__novitadeepseekdeepseekv32__2026-03-12-16-52",
            "metrics_file": "like__requesty__novitadeepseekdeepseekv32__2026-03-12-16-52__metrics.json",
            "timestamp": "2026-03-12T15:52:04.876466Z",
            "accuracy": 0.6446446446446447,
            "cohen_kappa": 0.47974932556430355
          },
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "like__vertex__gemini3flashpreview__2026-03-12-01-31",
            "metrics_file": "like__vertex__gemini3flashpreview__2026-03-12-01-31__metrics.json",
            "timestamp": "2026-03-12T00:31:13.838640Z",
            "accuracy": 0.8838838838838838,
            "cohen_kappa": 0.832653167320839
          },
          {
            "provider": "vertex",
            "model": "gemini-3.1-pro-preview",
            "run_stem": "like__vertex__gemini31propreview__2026-03-12-18-10",
            "metrics_file": "like__vertex__gemini31propreview__2026-03-12-18-10__metrics.json",
            "timestamp": "2026-03-12T17:10:58.353313Z",
            "accuracy": 0.8948948948948949,
            "cohen_kappa": 0.8484215728492612
          }
        ]
      },
      {
        "group_id": "fa72ac2e9e024482",
        "representative_policy": "best_accuracy",
        "task_fingerprint": "9d6ea446c173cf044401c04094f095a25fb7ceec9caef163c35be7264a0bb349",
        "normalized_tag_key": "*like*;discourse;english;pragmatics;v3",
        "task_name_display": "*like* discourse/pragm",
        "task_names_seen": [
          "*like* discourse/pragm"
        ],
        "tags_display": "*like*; discourse; pragmatics; English; v3",
        "model_count": 3,
        "alpha_nominal": 0.6518785370596594,
        "pairable_item_count": 112,
        "rated_item_count": 112,
        "fully_shared_item_count": 98,
        "category_count": 9,
        "representative_run_stems": [
          "like_interrater__openai__gpt54__2026-03-13-23-51",
          "like_interrater__requesty__anthropicclaudesonnet46__2026-03-13-22-58",
          "like_interrater__vertex__gemini3flashpreview__2026-03-13-15-32"
        ],
        "representatives": [
          {
            "provider": "openai",
            "model": "gpt-5.4",
            "run_stem": "like_interrater__openai__gpt54__2026-03-13-23-51",
            "metrics_file": "like_interrater__openai__gpt54__2026-03-13-23-51__metrics.json",
            "timestamp": "2026-03-13T22:51:21.706190Z",
            "accuracy": 0.5217391304347826,
            "cohen_kappa": 0.4387755102040816
          },
          {
            "provider": "requesty",
            "model": "claude-sonnet-4-6",
            "run_stem": "like_interrater__requesty__anthropicclaudesonnet46__2026-03-13-22-58",
            "metrics_file": "like_interrater__requesty__anthropicclaudesonnet46__2026-03-13-22-58__metrics.json",
            "timestamp": "2026-03-13T21:58:23.211617Z",
            "accuracy": 0.7478260869565218,
            "cohen_kappa": 0.6706172839506173
          },
          {
            "provider": "vertex",
            "model": "gemini-3-flash-preview",
            "run_stem": "like_interrater__vertex__gemini3flashpreview__2026-03-13-15-32",
            "metrics_file": "like_interrater__vertex__gemini3flashpreview__2026-03-13-15-32__metrics.json",
            "timestamp": "2026-03-13T14:32:09.952389Z",
            "accuracy": 0.808695652173913,
            "cohen_kappa": 0.7482587064676617
          }
        ]
      }
    ]
  }
}