Status
β complete
Domain
metr.org
Archived
2026-04-10 22:49:09
Original URL
Plaintext Content
(48.8 KB)
.menu-main-mobile { visibility: hidden; } Research Notes Updates About Donate Careers Search .logo-slideable-text { opacity: 0; } Research Notes Updates About Donate Careers Menu .hide-if-compact { display: none; } Model Evaluation & Threat Research METR conducts research and evaluations to improve public understanding of the capabilities and risks of frontier AI systems. Our research Careers We've worked with .chart-container { width: 100%; height: fit-content; position: relative; padding: 0 0 0 20px; @media (max-width: 635px) { height: 420px; overflow-y: clip; } } .embed-chart-container .chart-container { height: 520px !important; } .axis { font-size: 14px; } .grid .domain { display: none; } .axis.x-axis g:first-of-type line { display: none; } .axis-label { font-size: 16px; font-weight: 500; } .grid line { stroke: #f0f0f0; stroke-opacity: 0.9; stroke-dasharray: 4, 2; } .dot { stroke-width: 2; stroke: #fff; cursor: pointer; } .dot:hover { r: 8; } .dot.frontier { fill: #2e7d32; } .dot.non-frontier { fill: #9e9e9e; } /* Split dot styles for overlapping models */ .dot-half { stroke-width: 0; cursor: pointer; transition: opacity 0.15s; } .dot-half.frontier { fill: #2e7d32; } .dot-half.non-frontier { fill: #9e9e9e; } .dot-half:hover { opacity: 0.8; } .split-dot-group:hover .split-dot-divider { stroke-width: 2; } .tooltip { position: absolute; text-align: left; padding: 12px; font-size: 14px; font-family: Montserrat, sans-serif; background: rgba(0, 0, 0, 0.9); color: white; border-radius: 4px; opacity: 0; transition: opacity 0.2s; a { color: var(--color-primary); text-decoration: underline; filter: invert(1) hue-rotate(180deg); &:hover { filter: invert(1) hue-rotate(180deg) brightness(1.2); } } } .tooltip h4 { margin: 0 0 5px 0; font-size: 16px; } .tooltip p { margin: 0; font-size: 13px; } .legend-dot { display: inline-block; width: 12px; height: 12px; border-radius: 50%; vertical-align: middle; margin-right: 5px; } .legend-dot.frontier { background-color: #2e7d32; } 
.legend-dot.non-frontier { background-color: #9e9e9e; } .chart-controls { display: flex; flex-direction: row; flex-wrap: nowrap; align-items: center; justify-content: space-between; margin: 1rem auto 20px; position: relative; padding-inline: 32px; .btn-group { padding: 0.5rem; background: none; border: none; @container (max-width: 500px) { padding: 0; gap: 0.1rem; } button { color: #999; border-radius: 0.5rem !important; border-right: none !important; &:hover { color: var(--text-dark); background-color: var(--almost-white) !important; } &.active { color: var(--text-dark); background-color: var(--gray-background); } } } .download-btn { border-radius: 0.5rem !important; border: none !important; font-size: 16px; @container (max-width: 420px) { position: absolute; bottom: 2rem; right: 0.25rem; } &:hover { background-color: var(--gray-background); } } } .main-right-column .chart-controls { margin-top: -0.25rem; } .btn-group { display: inline-flex; border-radius: 6px; overflow: hidden; background-color: var(--gray-background); } .btn-group.probability-toggle, .btn-group.scale-toggle { overflow: visible; } .btn-group button { background-color: transparent; border: none; color: #666; padding: 8px 14px; cursor: pointer; transition: all 0.3s ease; font-size: 14px; font-weight: 500; line-height: 1.2; margin: 0; position: relative; @container (max-width: 768px) { padding: 6px 12px; font-size: 13px; } } .btn-group button:not(:last-child) { border-right: 1px solid #e0e0e0; } .btn-group button:hover:not(.active) { background-color: rgba(255,255,255,0.5); color: #333; } .btn-group button.active { background-color: var(--color-primary); color: white; } /* Box-shadow glow on toggle group during auto-switch */ .scale-toggle { transition: box-shadow 0.5s ease-in-out; } .scale-toggle.effect-glow { box-shadow: 0 0 12px rgba(88,152,133,0.6); } /* Custom tooltip for probability toggle */ .probability-tooltip { position: fixed; transform: translate(-50%, -100%); background-color: rgba(0, 
0, 0, 0.9); color: white; padding: 12px 16px; border-radius: 6px; font-size: 13px; white-space: normal; width: 300px; text-align: center; opacity: 0; visibility: hidden; transition: opacity 0.3s, visibility 0.3s; pointer-events: none; z-index: 10000; line-height: 1.5; box-shadow: 0 4px 8px rgba(0,0,0,0.2); margin-bottom: 10px; } .probability-tooltip::after { content: ''; position: absolute; top: 100%; left: var(--caret-offset, 50%); transform: translateX(-50%); border: 8px solid transparent; border-top-color: rgba(0, 0, 0, 0.9); } .probability-tooltip.visible { opacity: 1; visibility: visible; } /* Version dropdown styles */ .version-dropdown { position: relative; display: inline-flex; align-items: center; } .version-dropdown-button { display: inline-flex; align-items: center; gap: 6px; background-color: var(--gray-background); border: none; border-radius: 6px; padding: 8px 14px; cursor: pointer; font-family: inherit; font-size: 14px; font-weight: 500; line-height: 1.2; color: #666; transition: all 0.3s ease; outline: none; white-space: nowrap; @container (max-width: 768px) { padding: 6px 12px; font-size: 13px; } } .version-dropdown-button:hover { background-color: #e8e8e8; color: #333; } .version-dropdown-button:focus { outline: none; } .version-dropdown-button svg { width: 16px; height: 16px; transition: transform 0.2s; } .version-dropdown.open .version-dropdown-button svg { transform: rotate(180deg); } .version-dropdown-menu { position: absolute; top: calc(100% + 4px); left: 0; background: white; border-radius: 8px; box-shadow: 0 4px 20px rgba(0, 0, 0, 0.15); min-width: 340px; z-index: 1000; opacity: 0; visibility: hidden; transform: translateY(-8px); transition: opacity 0.2s, visibility 0.2s, transform 0.2s; } .version-dropdown.open .version-dropdown-menu { opacity: 1; visibility: visible; transform: translateY(0); } /* Dropdown opens above controls (for embeds) */ .version-dropdown.dropdown-above .version-dropdown-menu { top: auto; bottom: calc(100% + 4px); 
transform: translateY(8px); } .version-dropdown.dropdown-above.open .version-dropdown-menu { transform: translateY(0); } .version-dropdown-item { padding: 12px 16px; cursor: pointer; border-bottom: 1px solid #f0f0f0; } .version-dropdown-item:last-child { border-bottom: none; } .version-dropdown-item:hover { background-color: #f8f8f8; } .version-dropdown-item:first-child { border-radius: 8px 8px 0 0; } .version-dropdown-item:last-child { border-radius: 0 0 8px 8px; } .version-dropdown-item.active { background-color: var(--color-primary); color: white; } .version-dropdown-item.active .version-item-description { color: rgba(255, 255, 255, 0.85); } .version-item-title { font-weight: 600; font-size: 14px; margin-bottom: 4px; } .version-item-description { font-size: 12px; color: #666; line-height: 1.4; } .version-item-description a { color: inherit; text-decoration: underline; } .version-dropdown-item.active .version-item-description a { color: rgba(255, 255, 255, 0.9); } .chart-caption { margin-top: 20px; padding: 0 20px; font-size: 14px; line-height: 1.6; color: var(--text-gray); padding-top: 1rem; border-top: 1px var(--separator-line-color) solid; text-align: end; } .text-right { text-align: right; } .time-horizon-chart-container { padding-top: 0; padding-bottom: 0; container-type: inline-size; } /* When full-bleed is applied, allow the container to expand */ .time-horizon-chart-container.full-bleed { max-width: none; } /* The inner container provides the max-width constraint */ .time-horizon-chart-container .container { max-width: 1200px; } .model-label-bg { fill: white; opacity: 0.75; } @container (max-width: 768px) { .chart-controls { @supports (-moz-appearance: none) { @media (hover: none) and (pointer: coarse) { max-width: calc(100vw - 20px); // Fix Firefox Android overflow } } } .btn-group { font-size: 13px; } .btn-group button, .version-dropdown-button { padding: 4px 8px; font-size: 13px; } } .shown-when-narrow { display: none; } @container (max-width: 800px) { 
.chart-controls { margin-left: -0.5rem; } .version-dropdown-button { gap: 2px; } .btn-group { gap: 0; } .btn-group button, .version-dropdown-button { font-size: 12px; } .download-btn { margin-left: -0.5rem; } } @container (max-width: 560px) { .hidden-when-narrow { display: none; } .shown-when-narrow { display: inline; } } @media (max-width: 600px) { .version-dropdown-button { gap: 2px; } .btn-group button, .version-dropdown-button { font-size: 12px; } } @media (max-width: 520px) { .hidden-when-narrow { display: none; } .shown-when-narrow { display: inline; } } Time Horizon 1.1 (Current) TH 1.1 Time Horizon 1.1 (Current) Follows the same methodology described in the initial paper, but with a larger task suite. See release announcement. Time Horizon 1.0 (Mar 2025) Original time horizon computations. Calculated for models from 2019 through Nov 2025, following the methods described in the original time horizon paper. Log Scale Linear Scale 50% Success 80% Success const benchmarkDataV1 = {"benchmark_name":"METR-Horizon-v1.0","doubling_time_in_days":{"all_time":{"ci_high":225.353,"ci_low":176.208,"point_estimate":201.15},"from_2023_on":{"ci_high":229.176,"ci_low":138.411,"point_estimate":175.603}},"long_tasks_version":"2ce7f1e0c4f8b7f2653e7014941a1a9f3ca908e2","results":{"claude_3_5_sonnet":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.481203},"is_sota":true,"p50_horizon_length":{"ci_high":35.808873,"ci_low":10.531997,"estimate":19.764601},"p80_horizon_length":{"ci_high":10.000262,"ci_low":1.818235,"estimate":4.192071}},"release_date":"2024-06-20","scaffolds":["modular-public",null]},"claude_3_5_sonnet_20241022":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.530535},"is_sota":true,"p50_horizon_length":{"ci_high":60.891717,"ci_low":14.72191,"estimate":30.454555},"p80_horizon_length":{"ci_high":15.195336,"ci_low":2.249329,"estimate":5.907631}},"release_date":"2024-10-22","scaffolds":["modular-public",null]},"c
laude_3_7_sonnet":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.605035},"is_sota":true,"p50_horizon_length":{"ci_high":89.187313,"ci_low":28.87097,"estimate":55.258445},"p80_horizon_length":{"ci_high":43.669897,"ci_low":7.749991,"estimate":20.378794}},"release_date":"2025-02-24","scaffolds":["flock-public",null]},"claude_3_opus":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.377459},"is_sota":false,"p50_horizon_length":{"ci_high":14.764711,"ci_low":3.217769,"estimate":7.148402},"p80_horizon_length":{"ci_high":4.612595,"ci_low":0.585264,"estimate":1.578216}},"release_date":"2024-03-04","scaffolds":["modular-public",null]},"claude_4_1_opus":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.677469},"is_sota":true,"p50_horizon_length":{"ci_high":200.80836,"ci_low":55.954932,"estimate":109.263038},"p80_horizon_length":{"ci_high":66.554968,"ci_low":8.357108,"estimate":27.289864}},"release_date":"2025-08-05","scaffolds":["flock-public",null]},"claude_4_opus":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.648693},"is_sota":false,"p50_horizon_length":{"ci_high":140.617499,"ci_low":44.934856,"estimate":83.567793},"p80_horizon_length":{"ci_high":61.850971,"ci_low":9.151599,"estimate":26.88671}},"release_date":"2025-05-22","scaffolds":["flock-public",null]},"claude_4_sonnet":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.632651},"is_sota":false,"p50_horizon_length":{"ci_high":127.091712,"ci_low":38.322944,"estimate":73.660733},"p80_horizon_length":{"ci_high":49.861238,"ci_low":7.941505,"estimate":21.948674}},"release_date":"2025-05-22","scaffolds":["flock-public",null]},"claude_opus_4_5":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.753054},"is_sota":true,"p50_horizon_length":{"ci_high":821.560215,"ci_low":104.265659,"estimate":246.781803},"p80_horizon_length":{"ci_high":114.200812,"ci_low"
:9.313074,"estimate":36.039286}},"release_date":"2025-11-24","scaffolds":["flock-public","metr_tasks_swaa/init_solver,generate"]},"claude_sonnet_4_5":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.683012},"is_sota":false,"p50_horizon_length":{"ci_high":225.752849,"ci_low":58.936193,"estimate":116.528413},"p80_horizon_length":{"ci_high":73.824859,"ci_low":7.513898,"estimate":26.095319}},"release_date":"2025-09-29","scaffolds":["flock-public",null]},"davinci_002":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.161869},"is_sota":true,"p50_horizon_length":{"ci_high":0.221209,"ci_low":0.093319,"estimate":0.144057},"p80_horizon_length":{"ci_high":0.105813,"ci_low":0.034813,"estimate":0.056238}},"release_date":"2020-05-28","scaffolds":["modular-public",null]},"deepseek_r1":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.520133},"is_sota":false,"p50_horizon_length":{"ci_high":52.750504,"ci_low":14.422102,"estimate":27.887616},"p80_horizon_length":{"ci_high":13.373897,"ci_low":2.344816,"estimate":5.629461}},"release_date":"2025-01-20","scaffolds":["flock-public",null]},"deepseek_r1_0528":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.542232},"is_sota":false,"p50_horizon_length":{"ci_high":64.081849,"ci_low":13.756178,"estimate":32.764282},"p80_horizon_length":{"ci_high":15.485703,"ci_low":1.353217,"estimate":4.671541}},"release_date":"2025-05-28","scaffolds":["flock-public",null]},"deepseek_v3":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.474508},"is_sota":false,"p50_horizon_length":{"ci_high":35.001162,"ci_low":10.37305,"estimate":19.822812},"p80_horizon_length":{"ci_high":13.145861,"ci_low":2.208963,"estimate":5.424593}},"release_date":"2024-12-26","scaffolds":["flock-public",null]},"deepseek_v3_0324":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.497576},"is_sota":false,"p50_horizon_
length":{"ci_high":43.32637,"ci_low":12.569662,"estimate":24.594348},"p80_horizon_length":{"ci_high":15.876695,"ci_low":3.283517,"estimate":7.452729}},"release_date":"2025-03-24","scaffolds":["flock-public",null]},"gemini_2_5_pro_preview":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.557925},"is_sota":false,"p50_horizon_length":{"ci_high":69.856777,"ci_low":20.357511,"estimate":40.182282},"p80_horizon_length":{"ci_high":27.288387,"ci_low":4.659581,"estimate":12.565677}},"release_date":"2025-06-05","scaffolds":["flock-public",null]},"gpt-oss-120b":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.574566},"is_sota":false,"p50_horizon_length":{"ci_high":85.537034,"ci_low":19.901035,"estimate":45.110184},"p80_horizon_length":{"ci_high":21.625724,"ci_low":2.416079,"estimate":8.35222}},"release_date":"2025-08-05","scaffolds":["flock-public",null]},"gpt2":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.101046},"is_sota":true,"p50_horizon_length":{"ci_high":0.141825,"ci_low":0.009994,"estimate":0.053778},"p80_horizon_length":{"ci_high":0.068612,"ci_low":0.001589,"estimate":0.0128}},"release_date":"2019-02-14","scaffolds":[null]},"gpt_3_5_turbo_instruct":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.214611},"is_sota":true,"p50_horizon_length":{"ci_high":1.115905,"ci_low":0.255148,"estimate":0.599247},"p80_horizon_length":{"ci_high":0.492933,"ci_low":0.141649,"estimate":0.25538}},"release_date":"2022-03-15","scaffolds":["modular-public",null]},"gpt_4":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.361067},"is_sota":true,"p50_horizon_length":{"ci_high":11.721889,"ci_low":2.768122,"estimate":6.024114},"p80_horizon_length":{"ci_high":3.856515,"ci_low":0.495441,"estimate":1.367526}},"release_date":"2023-03-14","scaffolds":["modular-public",null]},"gpt_4_0125":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{
"estimate":0.351726},"is_sota":false,"p50_horizon_length":{"ci_high":11.553041,"ci_low":3.267886,"estimate":6.346859},"p80_horizon_length":{"ci_high":4.699039,"ci_low":0.854185,"estimate":1.913986}},"release_date":"2024-01-25","scaffolds":["modular-public",null]},"gpt_4_1106":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.404329},"is_sota":true,"p50_horizon_length":{"ci_high":18.162454,"ci_low":4.468489,"estimate":9.394536},"p80_horizon_length":{"ci_high":5.666518,"ci_low":0.73874,"estimate":2.01019}},"release_date":"2023-11-06","scaffolds":["modular-public",null]},"gpt_4_turbo":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.367357},"is_sota":false,"p50_horizon_length":{"ci_high":14.21142,"ci_low":4.050277,"estimate":7.777455},"p80_horizon_length":{"ci_high":6.034151,"ci_low":1.084966,"estimate":2.492662}},"release_date":"2024-04-09","scaffolds":["modular-public",null]},"gpt_4o":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.408517},"is_sota":true,"p50_horizon_length":{"ci_high":19.508766,"ci_low":5.000948,"estimate":10.174857},"p80_horizon_length":{"ci_high":5.947962,"ci_low":0.947346,"estimate":2.374364}},"release_date":"2024-05-13","scaffolds":["modular-public",null]},"gpt_5":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.698403},"is_sota":true,"p50_horizon_length":{"ci_high":247.820152,"ci_low":67.703931,"estimate":130.816528},"p80_horizon_length":{"ci_high":92.692214,"ci_low":9.79551,"estimate":32.984347}},"release_date":"2025-08-07","scaffolds":["flock-secret",null]},"gpt_5_1_codex_max":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.721134},"is_sota":true,"p50_horizon_length":{"ci_high":336.504695,"ci_low":80.193214,"estimate":162.253699},"p80_horizon_length":{"ci_high":129.233087,"ci_low":10.315095,"estimate":40.039035}},"release_date":"2025-11-19","scaffolds":["flock-camel","metr_tasks_swaa/init_solve
r,generate"]},"grok_4":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.667187},"is_sota":true,"p50_horizon_length":{"ci_high":208.297605,"ci_low":47.555449,"estimate":104.84751},"p80_horizon_length":{"ci_high":55.346662,"ci_low":4.757614,"estimate":18.59131}},"release_date":"2025-07-09","scaffolds":["flock-public",null]},"kimi_k2_thinking":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.60027},"is_sota":false,"p50_horizon_length":{"ci_high":100.591631,"ci_low":27.423314,"estimate":57.621724},"p80_horizon_length":{"ci_high":38.984235,"ci_low":4.37601,"estimate":15.004943}},"release_date":"2025-11-06","scaffolds":["flock-public","metr_tasks_swaa/init_solver,generate"]},"o1_elicited":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.56549},"is_sota":true,"p50_horizon_length":{"ci_high":85.806317,"ci_low":19.822684,"estimate":41.551706},"p80_horizon_length":{"ci_high":21.983284,"ci_low":2.557853,"estimate":7.645697}},"release_date":"2024-12-05","scaffolds":["flock",null]},"o1_preview":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.492955},"is_sota":true,"p50_horizon_length":{"ci_high":39.370195,"ci_low":12.640748,"estimate":23.254728},"p80_horizon_length":{"ci_high":13.27718,"ci_low":2.800499,"estimate":6.418505}},"release_date":"2024-09-12","scaffolds":["duet",null]},"o3":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.658782},"is_sota":true,"p50_horizon_length":{"ci_high":158.036682,"ci_low":46.571206,"estimate":91.443799},"p80_horizon_length":{"ci_high":67.256198,"ci_low":9.243647,"estimate":27.30067}},"release_date":"2025-04-16","scaffolds":["flock-secret",null]},"o4-mini":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.642837},"is_sota":false,"p50_horizon_length":{"ci_high":142.123876,"ci_low":37.053517,"estimate":76.748308},"p80_horizon_length":{"ci_high":52.779547,"ci_low":6.654501
,"estimate":20.147776}},"release_date":"2025-04-16","scaffolds":["flock-secret",null]},"qwen_2_5_72b":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.357771},"is_sota":false,"p50_horizon_length":{"ci_high":12.146289,"ci_low":2.563669,"estimate":5.813798},"p80_horizon_length":{"ci_high":3.916909,"ci_low":0.46914,"estimate":1.325385}},"release_date":"2024-09-19","scaffolds":["flock-public",null]},"qwen_2_72b":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.298999},"is_sota":false,"p50_horizon_length":{"ci_high":5.937061,"ci_low":0.945777,"estimate":2.536634},"p80_horizon_length":{"ci_high":1.983832,"ci_low":0.185057,"estimate":0.595157}},"release_date":"2024-06-07","scaffolds":["flock-public",null]}},"swaa_version":"3d2ab4f0662a752409858a73e006af35e3fb7d64"}; const benchmarkDataV1_1 = {"benchmark_name":"METR-Horizon-v1.1","doubling_time_in_days":{"all_time_stitched":{"point_estimate":187.778},"from_2023_on":{"ci_high":158.012,"ci_low":104.428,"point_estimate":128.744}},"long_tasks_version":"799cc9c4b4483a93fc3445623a49ea1bd74fdeb2","results":{"claude_3_5_sonnet_20240620_inspect":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.401502},"is_sota":true,"p50_horizon_length":{"ci_high":22.384214,"ci_low":5.489734,"estimate":11.395377},"p80_horizon_length":{"ci_high":4.573519,"ci_low":0.57927,"estimate":1.671757}},"release_date":"2024-06-20","scaffolds":["mtb/start_metr_task,metr_agents/react","ai_rd_fix_embedding/init_solver,use_tools,metr_agents/react","re_bench_common/init_solver,use_tools,metr_agents/react","metr_tasks_swaa/init_solver,generate","metr_agents/react"]},"claude_3_5_sonnet_20241022_inspect":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.452356},"is_sota":true,"p50_horizon_length":{"ci_high":40.82028,"ci_low":10.144026,"estimate":20.522872},"p80_horizon_length":{"ci_high":7.363864,"ci_low":0.89379,"estimate":2.595677}},"release_date":"2
024-10-22","scaffolds":["mtb/start_metr_task,metr_agents/react","ai_rd_fix_embedding/init_solver,use_tools,metr_agents/react","re_bench_common/init_solver,use_tools,metr_agents/react","metr_tasks_swaa/init_solver,generate","metr_agents/react"]},"claude_3_7_sonnet_inspect":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.558217},"is_sota":true,"p50_horizon_length":{"ci_high":104.226017,"ci_low":33.006168,"estimate":60.388937},"p80_horizon_length":{"ci_high":28.889879,"ci_low":4.575608,"estimate":12.09179}},"release_date":"2025-02-24","scaffolds":["mtb/start_metr_task,metr_agents/react","ai_rd_fix_embedding/init_solver,use_tools,metr_agents/react","re_bench_common/init_solver,use_tools,metr_agents/react","metr_tasks_swaa/init_solver,generate","metr_agents/react"]},"claude_3_opus_inspect":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.294753},"is_sota":false,"p50_horizon_length":{"ci_high":8.76484,"ci_low":1.706313,"estimate":3.952262},"p80_horizon_length":{"ci_high":2.118276,"ci_low":0.190491,"estimate":0.638973}},"release_date":"2024-03-04","scaffolds":["mtb/start_metr_task,metr_agents/react","ai_rd_fix_embedding/init_solver,use_tools,metr_agents/react","re_bench_common/init_solver,use_tools,metr_agents/react","metr_tasks_swaa/init_solver,generate","metr_agents/react"]},"claude_4_1_opus_inspect":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.615713},"is_sota":false,"p50_horizon_length":{"ci_high":159.451918,"ci_low":59.272391,"estimate":100.472004},"p80_horizon_length":{"ci_high":47.785951,"ci_low":10.538456,"estimate":23.455761}},"release_date":"2025-08-05","scaffolds":["mtb/start_metr_task,metr_agents/react","ai_rd_fix_embedding/init_solver,use_tools,metr_agents/react","re_bench_common/init_solver,use_tools,metr_agents/react","metr_tasks_swaa/init_solver,generate","metr_agents/react"]},"claude_4_opus_inspect":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score"
:{"estimate":0.615452},"is_sota":false,"p50_horizon_length":{"ci_high":163.454074,"ci_low":59.994073,"estimate":100.366123},"p80_horizon_length":{"ci_high":41.937268,"ci_low":8.483125,"estimate":20.429752}},"release_date":"2025-05-22","scaffolds":["mtb/start_metr_task,metr_agents/react","ai_rd_fix_embedding/init_solver,use_tools,metr_agents/react","re_bench_common/init_solver,use_tools,metr_agents/react","metr_tasks_swaa/init_solver,generate","metr_agents/react"]},"claude_opus_4_5_inspect":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.73007},"is_sota":true,"p50_horizon_length":{"ci_high":623.704698,"ci_low":161.717714,"estimate":292.994594},"p80_horizon_length":{"ci_high":104.641318,"ci_low":20.660485,"estimate":49.430584}},"release_date":"2025-11-24","scaffolds":["mtb/start_metr_task,metr_agents/react","ai_rd_fix_embedding/init_solver,use_tools,metr_agents/react","re_bench_common/init_solver,use_tools,metr_agents/react","metr_tasks_swaa/init_solver,generate","metr_agents/react"]},"claude_opus_4_6_inspect":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.788644},"is_sota":true,"p50_horizon_length":{"ci_high":3633.786163,"ci_low":316.685725,"estimate":718.80683},"p80_horizon_length":{"ci_high":170.437873,"ci_low":27.026521,"estimate":69.874587}},"release_date":"2026-02-05","scaffolds":["mtb/start_metr_task,metr_agents/react","ai_rd_fix_embedding/init_solver,use_tools,metr_agents/react","re_bench_common/init_solver,use_tools,metr_agents/react","metr_tasks_swaa/init_solver,generate","metr_agents/react"]},"davinci_002":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.161869},"is_sota":true,"p50_horizon_length":{"ci_high":0.221209,"ci_low":0.093319,"estimate":0.144057},"p80_horizon_length":{"ci_high":0.105813,"ci_low":0.034813,"estimate":0.056238}},"release_date":"2020-05-28","scaffolds":["modular-public",null]},"gpt_5_3_codex":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"av
erage_score":{"estimate":0.745439},"is_sota":false,"p50_horizon_length":{"ci_high":816.351994,"ci_low":194.913134,"estimate":349.530732},"p80_horizon_length":{"ci_high":122.058963,"ci_low":22.361504,"estimate":54.739407}},"release_date":"2026-02-05","scaffolds":["mtb/start_metr_task,triframe_inspect/triframe_agent","ai_rd_fix_embedding/init_solver,use_tools,triframe_inspect/triframe_agent","re_bench_common/init_solver,use_tools,triframe_inspect/triframe_agent","metr_tasks_swaa/init_solver,generate","triframe_inspect/triframe_agent"]},"gemini_3_pro":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.709822},"is_sota":true,"p50_horizon_length":{"ci_high":379.235434,"ci_low":139.565157,"estimate":224.325884},"p80_horizon_length":{"ci_high":103.295501,"ci_low":25.650451,"estimate":54.142849}},"release_date":"2025-11-18","scaffolds":["mtb/start_metr_task,metr_agents/react","ai_rd_fix_embedding/init_solver,use_tools,metr_agents/react","re_bench_common/init_solver,use_tools,metr_agents/react","metr_tasks_swaa/init_solver,generate","metr_agents/react"]},"gpt2":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.101046},"is_sota":true,"p50_horizon_length":{"ci_high":0.141825,"ci_low":0.009994,"estimate":0.053778},"p80_horizon_length":{"ci_high":0.068612,"ci_low":0.001589,"estimate":0.0128}},"release_date":"2019-02-14","scaffolds":[null]},"gpt_3_5_turbo_instruct":{"benchmark_name":"METR-Horizon-v1.0","metrics":{"average_score":{"estimate":0.214611},"is_sota":true,"p50_horizon_length":{"ci_high":1.115905,"ci_low":0.255148,"estimate":0.599247},"p80_horizon_length":{"ci_high":0.492933,"ci_low":0.141649,"estimate":0.25538}},"release_date":"2022-03-15","scaffolds":["modular-public",null]},"gpt_4":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.293044},"is_sota":true,"p50_horizon_length":{"ci_high":7.995283,"ci_low":1.93292,"estimate":3.987428},"p80_horizon_length":{"ci_high":2.523746,"ci_low":0.3
42984,"estimate":0.889561}},"release_date":"2023-03-14","scaffolds":["modular-public",null]},"gpt_4_1106_inspect":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.28905},"is_sota":true,"p50_horizon_length":{"ci_high":8.443226,"ci_low":1.866859,"estimate":4.044959},"p80_horizon_length":{"ci_high":2.358414,"ci_low":0.276599,"estimate":0.783032}},"release_date":"2023-11-06","scaffolds":["mtb/start_metr_task,metr_agents/react","ai_rd_fix_embedding/init_solver,use_tools,metr_agents/react","re_bench_common/init_solver,use_tools,metr_agents/react","metr_tasks_swaa/init_solver,generate","metr_agents/react"]},"gpt_4_turbo_inspect":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.271786},"is_sota":false,"p50_horizon_length":{"ci_high":6.736613,"ci_low":1.980046,"estimate":3.732787},"p80_horizon_length":{"ci_high":2.196806,"ci_low":0.428277,"estimate":0.927933}},"release_date":"2024-04-09","scaffolds":["mtb/start_metr_task,metr_agents/react","ai_rd_fix_embedding/init_solver,use_tools,metr_agents/react","re_bench_common/init_solver,use_tools,metr_agents/react","metr_tasks_swaa/init_solver,generate","metr_agents/react"]},"gpt_4o_inspect":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.338424},"is_sota":true,"p50_horizon_length":{"ci_high":12.905741,"ci_low":4.001482,"estimate":6.991195},"p80_horizon_length":{"ci_high":3.01517,"ci_low":0.562997,"estimate":1.267009}},"release_date":"2024-05-13","scaffolds":["mtb/start_metr_task,metr_agents/react","ai_rd_fix_embedding/init_solver,use_tools,metr_agents/react","re_bench_common/init_solver,use_tools,metr_agents/react","metr_tasks_swaa/init_solver,generate","metr_agents/react"]},"gpt_5_1_codex_max_inspect":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.708301},"is_sota":false,"p50_horizon_length":{"ci_high":396.206723,"ci_low":134.31077,"estimate":223.714694},"p80_horizon_length":{"ci_high":88.365435,"ci_low":27.30
204,"estimate":50.632499}},"release_date":"2025-11-19","scaffolds":["mtb/start_metr_task,triframe_inspect/triframe_agent","ai_rd_fix_embedding/init_solver,use_tools,triframe_inspect/triframe_agent","re_bench_common/init_solver,use_tools,triframe_inspect/triframe_agent","metr_tasks_swaa/init_solver,generate","triframe_inspect/triframe_agent"]},"gpt_5_2":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.752875},"is_sota":true,"p50_horizon_length":{"ci_high":815.177445,"ci_low":198.067494,"estimate":352.249302},"p80_horizon_length":{"ci_high":131.730719,"ci_low":31.665111,"estimate":66.002649}},"release_date":"2025-12-11","scaffolds":["mtb/start_metr_task,triframe_inspect/triframe_agent","ai_rd_fix_embedding/init_solver,use_tools,triframe_inspect/triframe_agent","re_bench_common/init_solver,use_tools,triframe_inspect/triframe_agent","metr_tasks_swaa/init_solver,generate","triframe_inspect/triframe_agent"]},"gpt_5_2025_08_07_inspect":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.693697},"is_sota":true,"p50_horizon_length":{"ci_high":405.551565,"ci_low":112.641357,"estimate":203.012577},"p80_horizon_length":{"ci_high":70.093125,"ci_low":19.184874,"estimate":38.312431}},"release_date":"2025-08-07","scaffolds":["mtb/start_metr_task,triframe_inspect/triframe_agent","ai_rd_fix_embedding/init_solver,use_tools,triframe_inspect/triframe_agent","re_bench_common/init_solver,use_tools,triframe_inspect/triframe_agent","metr_tasks_swaa/init_solver,generate","triframe_inspect/triframe_agent"]},"gpt_5_4":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.743443},"is_sota":false,"p50_horizon_length":{"ci_high":768.779526,"ci_low":186.581591,"estimate":341.735276},"p80_horizon_length":{"ci_high":108.679232,"ci_low":23.957027,"estimate":53.877851}},"release_date":"2026-03-05","scaffolds":["mtb/start_metr_task,metr_agents/react","ai_rd_fix_embedding/init_solver,use_tools,metr_agents/react","re_bench_
common/init_solver,use_tools,metr_agents/react","metr_tasks_swaa/init_solver,generate","metr_agents/react"]},"o1_inspect":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.510512},"is_sota":true,"p50_horizon_length":{"ci_high":64.95249,"ci_low":21.164512,"estimate":38.831588},"p80_horizon_length":{"ci_high":16.640636,"ci_low":3.036033,"estimate":7.090121}},"release_date":"2024-12-05","scaffolds":["mtb/start_metr_task,triframe_inspect/triframe_agent","ai_rd_fix_embedding/init_solver,use_tools,triframe_inspect/triframe_agent","re_bench_common/init_solver,use_tools,triframe_inspect/triframe_agent","metr_tasks_swaa/init_solver,generate","triframe_inspect/triframe_agent"]},"o1_preview":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.451441},"is_sota":true,"p50_horizon_length":{"ci_high":33.379877,"ci_low":11.716193,"estimate":20.326586},"p80_horizon_length":{"ci_high":8.912444,"ci_low":2.012646,"estimate":4.420545}},"release_date":"2024-09-12","scaffolds":["duet",null]},"o3_inspect":{"benchmark_name":"METR-Horizon-v1.1","metrics":{"average_score":{"estimate":0.636235},"is_sota":true,"p50_horizon_length":{"ci_high":190.943818,"ci_low":74.615398,"estimate":119.732634},"p80_horizon_length":{"ci_high":57.823511,"ci_low":15.127348,"estimate":29.981603}},"release_date":"2025-04-16","scaffolds":["mtb/start_metr_task,triframe_inspect/triframe_agent","ai_rd_fix_embedding/init_solver,use_tools,triframe_inspect/triframe_agent","re_bench_common/init_solver,use_tools,triframe_inspect/triframe_agent","metr_tasks_swaa/init_solver,generate","triframe_inspect/triframe_agent"]}},"swaa_version":"f6cc84052e2a79dd540766c8a1b15ff399696371"}; let benchmarkData = benchmarkDataV1_1; // Default to v1.1 (new data source) let currentDataVersion = 'TH 1.1'; document.addEventListener('DOMContentLoaded', function() { const dropdown = document.getElementById('version-dropdown'); const dropdownButton = 
dropdown.querySelector('.version-dropdown-button'); const dropdownLabel = document.getElementById('version-dropdown-label'); const dropdownItems = dropdown.querySelectorAll('.version-dropdown-item'); // Toggle dropdown dropdownButton.addEventListener('click', function(e) { e.stopPropagation(); dropdown.classList.toggle('open'); }); // Close dropdown when clicking outside document.addEventListener('click', function(e) { if (!dropdown.contains(e.target)) { dropdown.classList.remove('open'); } }); // Handle item selection dropdownItems.forEach(function(item) { item.addEventListener('click', function(e) { e.stopPropagation(); const version = this.dataset.version; if (version === currentDataVersion) { dropdown.classList.remove('open'); return; } // Update active state dropdownItems.forEach(i => i.classList.remove('active')); this.classList.add('active'); // Update label and download links const rawDataLink = document.getElementById('raw-data-link'); const downloadBtn = document.getElementById('download-btn'); if (version === 'TH 1.1') { dropdownLabel.innerHTML = '<span class="hidden-when-narrow">Time Horizon 1.1 (Current)</span><span class="shown-when-narrow">TH 1.1</span>'; benchmarkData = benchmarkDataV1_1; if (rawDataLink) rawDataLink.href = '/assets/benchmark_results_1_1.yaml'; if (downloadBtn) downloadBtn.href = '/assets/benchmark_results_1_1.yaml'; } else { dropdownLabel.innerHTML = '<span class="hidden-when-narrow">Time Horizon 1.0</span><span class="shown-when-narrow">TH 1.0</span>'; benchmarkData = benchmarkDataV1; if (rawDataLink) rawDataLink.href = '/assets/benchmark_results_1_0.yaml'; if (downloadBtn) downloadBtn.href = '/assets/benchmark_results_1_0.yaml'; } currentDataVersion = version; // Smoothly transition to new data using the setDataSource function if (typeof window.setDataSource === 'function') { window.setDataSource(); } else { // Fallback: reinitialize chart d3.select('#time-horizon-chart').selectAll('*').remove(); 
d3.selectAll('.tooltip').remove(); initChart(); } dropdown.classList.remove('open'); }); }); }); document.addEventListener('DOMContentLoaded', function() { const probabilityToggle = document.querySelector('.probability-toggle'); if (probabilityToggle) { const tooltip = document.createElement('div'); tooltip.className = 'probability-tooltip'; tooltip.textContent = "Ex: an LLM with a '50% time horizon' of 10 minutes means we predict a 50% chance the LLM will successfully complete a task from our dataset that's 10 minutes long"; document.body.appendChild(tooltip); probabilityToggle.addEventListener('pointerenter', function(e) { if (e.pointerType !== 'mouse') return; const rect = probabilityToggle.getBoundingClientRect(); const container = document.querySelector('.time-horizon-chart-container'); const containerRect = container ? container.getBoundingClientRect() : null; const tooltipWidth = 300; // matches CSS width // Calculate centered position let leftPos = rect.left + rect.width / 2; let caretOffset = 50; // percentage from left, default centered // Constrain to container if available if (containerRect) { const tooltipLeft = leftPos - tooltipWidth / 2; const tooltipRight = leftPos + tooltipWidth / 2; if (tooltipRight > containerRect.right) { // Would overflow right - align tooltip's right edge with container's right edge const newLeft = containerRect.right - tooltipWidth / 2; // Calculate where the caret should point (as percentage) caretOffset = ((leftPos - (containerRect.right - tooltipWidth)) / tooltipWidth) * 100; leftPos = newLeft; } else if (tooltipLeft < containerRect.left) { // Would overflow left - align tooltip's left edge with container's left edge const newLeft = containerRect.left + tooltipWidth / 2; caretOffset = ((leftPos - containerRect.left) / tooltipWidth) * 100; leftPos = newLeft; } } tooltip.style.left = leftPos + 'px'; tooltip.style.setProperty('--caret-offset', caretOffset + '%'); tooltip.style.top = (rect.top - 10) + 'px'; 
tooltip.classList.add('visible'); }); probabilityToggle.addEventListener('pointerleave', function(e) { if (e.pointerType !== 'mouse') return; tooltip.classList.remove('visible'); }); } }); Task-Completion Time Horizons of Frontier AI Models We propose measuring AI performance in terms of the length of software tasks AI agents can complete. We show an exponential increase in this time horizon metric over the past 6 years. Read paper View repo Featured research Our AI evaluations research focuses on assessing broad autonomous capabilities and the ability of AI systems to accelerate AI R&D. We also study potential AI behavior that threatens the integrity of evaluations and mitigations for such behavior. View all research General Technical Policy View all research GPT-5.1 Evaluation Results We evaluate whether GPT-5.1 poses significant catastrophic risks via AI self-improvement, rogue replication, or sabotage of AI labs. Read more Measuring AI Ability to Complete Long Tasks We propose measuring AI performance in terms of the length of tasks AI agents can complete. We show that this metric has been consistently exponentially increasing. Read more Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity We found that when developers used AI tools in early 2025, they took 19% longer than without—AI made them slower. 
Read more MALT A dataset of natural and prompted examples of behaviors that threaten evaluation integrity, like generalized reward hacking or sandbagging Read more Measuring autonomous AI capabilities — resource collection An index of our research and guidance on how to measure AI systems' ability to autonomously complete a wide range of multi-hour tasks Read more Early work on monitorability evaluations We show preliminary results on a prototype evaluation that tests monitors' ability to catch AI agents doing side tasks, and AI agents' ability to bypass this monitoring Read more Common Elements of Frontier AI Safety Policies An analysis of the shared components across twelve published frontier AI safety policies, including capability thresholds, model weight security, and deployment mitigations Read more PDF Frontier AI Safety Policies A list of AI companies' frontier safety policies intended to evaluate and manage severe AI risks Read more What should companies share about risks from frontier AI models? We describe areas for risk transparency and specific technical questions that a frontier AI developer could answer. Read more Evaluation reports We conduct evaluations of the autonomous capabilities of frontier AI models, with some in partnership with AI developers such as Anthropic and OpenAI. We do this both to understand the models' capabilities and to pilot third-party evaluator arrangements. 
GPT-5.1-Codex-Max 19 November 2025 • Partnership GPT-5 7 August 2025 • Partnership DeepSeek and Qwen 27 June 2025 • No company involvement OpenAI o3 and o4-mini 16 April 2025 • Partnership Claude 3.7 4 April 2025 • Partnership DeepSeek-R1 5 March 2025 • No company involvement GPT-4.5 27 February 2025 • Partnership DeepSeek-V3 12 February 2025 • No company involvement Claude 3.5 Sonnet and o1 31 January 2025 • Partnership Claude 3.5 Sonnet (original) 30 October 2024 • Partnership o1-preview 12 September 2024 • Partnership GPT-4o 7 August 2024 • Partnership GPT-4 and Claude 17 March 2023 • Partnership View all evaluation reports METR does not accept compensation for this work. Companies such as OpenAI and Anthropic have provided access and compute credits to support evaluation research. We also occasionally evaluate models independently after they are released, without involvement from the model's developers. Recent public reports resulting from this work are above, with additional discussion in the respective system cards. Frontier AI Safety Policies We advise AI developers and governments on implementing risk assessment methodologies for AI. For example, we have advised developers on Frontier AI Safety Policies. Resources on FSPs Google OpenAI Anthropic METR in the press 5 February 2026 10 October 2025 8 August 2025 21 July 2025 20 July 2025 15 July 2025 14 July 2025 11 July 2025 11 July 2025 11 July 2025 11 July 2025 10 July 2025 10 July 2025 2 July 2025 28 March 2025 19 March 2025 21 March 2024 View all press coverage Recent MirrorCode: Evidence that AI can already do some weeks-long coding tasks 10 April 2026 Read more Fine-tuning experiments on CoT controllability 1 April 2026 We find that a small amount of fine-tuning on instruction following in the CoT generalizes to meaningful increases in CoT controllability on an out-of-distribution set of tasks. 
We fine-tune four reasoning models on small datasets of instruction-following reasoning data and OOD controllability rises from an average of 2.9% to 8.8% across four models. Read more Red-Teaming Anthropic's Internal Agent Monitoring Systems 26 March 2026 A METR staff member spent three weeks red-teaming a subset of Anthropic's internal agent monitoring and security systems, discovering several novel vulnerabilities. Read more Impact of modelling assumptions on time horizon results 20 March 2026 Alexander Barry examines how different modelling choices affect METR's time horizon estimates. Read more We spent 2 hours working in the future 19 March 2026 Thomas Kwa describes a tabletop exercise where METR researchers simulated having access to ~200-hour time horizon AIs. Read more Review of the Anthropic Sabotage Risk Report: Claude Opus 4.6 12 March 2026 External review from METR of Anthropic's Sabotage Risk Report for Claude Opus 4.6 Read more (function() { // Collapsible grid: used by both "METR in the press" and "Evaluation reports" sections. 
// Breakpoints match Bootstrap: sm=576px, md=768px const SM_BREAKPOINT = 576; const MD_BREAKPOINT = 768; function getResponsiveConfig() { const width = window.innerWidth; if (width < SM_BREAKPOINT) { return { initialRows: 3, rowsPerExpand: 6 }; } else if (width < MD_BREAKPOINT) { return { initialRows: 2, rowsPerExpand: 6 }; } else { return { initialRows: 2, rowsPerExpand: 4 }; } } function initCollapsibleGrid(containerId, buttonId, cardSelector, labelSuffix) { const container = document.getElementById(containerId); const button = document.getElementById(buttonId); if (!container || !button) return; let config = getResponsiveConfig(); let currentRows = config.initialRows; let fullyExpanded = false; function getRowHeight() { const firstCard = container.querySelector(cardSelector); if (!firstCard) return 0; const style = window.getComputedStyle(container); const gap = parseFloat(style.rowGap) || parseFloat(style.gap) || 0; return firstCard.offsetHeight + gap; } function getHeightForRows(rows) { const rowHeight = getRowHeight(); const style = window.getComputedStyle(container); const gap = parseFloat(style.rowGap) || parseFloat(style.gap) || 0; return (rowHeight * rows) - gap; } function getFullHeight() { const currentMax = container.style.maxHeight; container.style.maxHeight = 'none'; const height = container.scrollHeight; container.style.maxHeight = currentMax; return height; } function getTotalRows() { const rowHeight = getRowHeight(); if (rowHeight === 0) return 0; const fullHeight = getFullHeight(); return Math.ceil(fullHeight / rowHeight); } function updateButtonText() { const totalRows = getTotalRows(); const nextRows = currentRows + config.rowsPerExpand; if (nextRows >= totalRows) { button.textContent = 'View all ' + labelSuffix; } else { button.textContent = 'View more ' + labelSuffix; } } function updateVisibility() { const currentHeight = getHeightForRows(currentRows); const fullHeight = getFullHeight(); const hasOverflow = fullHeight > currentHeight + 1; 
container.style.maxHeight = currentHeight + 'px'; container.classList.add('collapsed'); button.style.display = hasOverflow ? '' : 'none'; updateButtonText(); } function expand() { const totalRows = getTotalRows(); const nextRows = currentRows + config.rowsPerExpand; const isFullExpand = nextRows >= totalRows; currentRows = isFullExpand ? totalRows : nextRows; const targetHeight = isFullExpand ? getFullHeight() : getHeightForRows(currentRows); container.classList.remove('collapsed'); container.classList.add('expanding'); container.offsetHeight; container.style.maxHeight = targetHeight + 'px'; setTimeout(() => { container.classList.remove('expanding'); if (isFullExpand) { fullyExpanded = true; container.style.maxHeight = ''; button.classList.add('fading-out'); setTimeout(() => { button.style.display = 'none'; }, 200); } else { container.classList.add('collapsed'); updateButtonText(); } }, 400); } button.addEventListener('click', function(e) { e.preventDefault(); expand(); }); updateVisibility(); window.addEventListener('resize', function() { config = getResponsiveConfig(); if (fullyExpanded) return; const fullHeight = getFullHeight(); const currentHeight = getHeightForRows(currentRows); if (fullHeight <= currentHeight + 1) { button.style.display = 'none'; } else { button.style.display = ''; updateButtonText(); } container.style.maxHeight = currentHeight + 'px'; }); } initCollapsibleGrid('media-card-container', 'view-all-press-btn', '.media-card', 'press coverage'); initCollapsibleGrid('evaluation-reports-container', 'view-all-reports-btn', '.evaluation-report-card', 'evaluation reports'); })(); METR (pronounced 'meter') is a research nonprofit that scientifically measures whether and when AI systems might threaten catastrophic harm to society. 
window.CustomSubstackWidget = { substackUrl: "metr.substack.com", placeholder: "Join our newsletter", buttonText: "Subscribe", theme: "custom", colors: { primary: "var(--color-bg-alt4)", input: "#FFFFFF", email: "#353535", text: "#000000", } }; Research Research Notes Updates Follow METR Substack Substack twitter X (Twitter) LinkedIn GitHub Company About Us Careers Donate Β© 2026 METR. All rights reserved. window.dataLayer = window.dataLayer || []; function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'G-MMLYWX6QCN'); document.addEventListener('DOMContentLoaded', () => { littlefoot.littlefoot({ activateOnHover: true, dismissOnUnhover: true, activateCallback: function(tooltip, button) { const match = button.id.match(/lf-fnref:(\d+):?\d*/); if (match) { const downLink = tooltip.querySelector('.littlefoot__content .a-fn-ref-raw'); // move it inside the tag before it (i.e., the paragraph tag) if (downLink) { downLink.setAttribute('href', `#fn:${match[1]}`); const content = tooltip.querySelector('.littlefoot__content'); const tags = content.children; if (tags.length > 0) { // find the last tag that isn't this link itself let lastTag = tags[tags.length - 1]; if (lastTag === downLink) { lastTag = tags[tags.length - 2]; } lastTag.appendChild(downLink); } } } }, buttonTemplate: `<button aria-label="Footnote <% number %>" class="littlefoot__button" id="<% reference %>" /> <span class="fn-ref-raw">[<% reference %>]</span> </button>`, contentTemplate: `<aside alt="Footnote <% number %>" class="littlefoot__popover" id="fncontent:<% id %>" > <div class="littlefoot__wrapper"> <div class="littlefoot__content"> <% content %> <a data-footnote-ref="<% reference %>" class="a-fn-ref-raw"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 640 640" style="height: 1rem"> <path d="M297.4 566.6C309.9 579.1 330.2 579.1 342.7 566.6L502.7 406.6C515.2 394.1 515.2 373.8 502.7 361.3C490.2 348.8 469.9 348.8 457.4 361.3L352 466.7L352 96C352 78.3 337.7 64 320 64C302.3 
64 288 78.3 288 96L288 466.7L182.6 361.3C170.1 348.8 149.8 348.8 137.3 361.3C124.8 373.8 124.8 394.1 137.3 406.6L297.3 566.6z"/> </svg></a> </div> </div> <div class="littlefoot__tooltip"></div> </aside>` }); // Down arrow from: Font Awesome Free v7.0.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free Copyright 2025 Fonticons, Inc. */ transformFootnoteReferences(); markLastFootnotesInGroupsByParagraph(); wrapFootnotesPrecedingCommasInNoWrapSpans(); addSpacesBetweenFootnotes(); scrollDownToFootnotesOnClick(); });Page Captures
Archived Files
Total Size: 9.4 MB
