{"schema_version":"papers/paper-detail-v1","title":"MINDGAMES: A Live Arena for Evaluating Social and Strategic Reasoning in Multi-Agent LLMs","surface":"papers","opportunity_kernel":{"paper_id":"d2f602ff-9154-4719-8afd-8b58d90344a4","title":"MINDGAMES: A Live Arena for Evaluating Social and Strategic Reasoning in Multi-Agent LLMs","authors":["Kevin Wang","Anna Thöni","Benjamin Kempinski","Bobby Cheng","Jianzhu Yao","Benjamin Finch","Leon Guertler","Viraj Nadkarni","Yihan Jiang","Aliaksei Korshuk","Alexander Buyantuev","Ilya Makarov","Siyuan Wu","Yu-Chi Cheng","Yan-Ru Ju","Ti-Rong Wu","I-Hsuan Chu","Yu-Yu Yang","I-Chen Wu","Yitian Huang","Qinlu Cao","Yiheng Sun","Yuhong Dai","Hongkun Yao","Jingxuan Fu","Jiwei Zhang","Hao Liao","Mossimo Ebeling","Govind Arun","Sadhvik Bathini","Mihir S Arya","Avinash Anish","Aditya Ranjan","Kirtana Sunil Phatnani","Paval KS","Vrushali Mehta","Aravind S","Nikhil Arora","Tanya Upadhyay","Amol Bandagale","Yuan Lu","ChunEn Hsiao","YuTing Lin","Arvin Chung","Jerry John Thomas","Mathieu Laurière","Leshem Choshen","Yoram Bachrach","Pramod Viswanath","Maria Polukarov","Cheston Tan","Tal Kachman","Atlas Wang"],"arxiv_id":"2605.29512v1","doi":null,"published_at":"2026-05-28T07:33:47.000Z","score_object":{"overall":{"value":3,"scale":"0-10","confidence":0.85,"confidence_reason":"Backfilled from persisted papers.viability_score.","model_version":"phase0-backfill-v1","pipeline_version":"phase0-kernel-v1","computed_at":"2026-05-29T20:18:26.550Z","fresh_until":"2026-06-28T20:18:26.550Z","is_stale":false,"source_count":1,"missingness":[]},"technical":{"value":1.4,"scale":"0-10","confidence":0.55,"confidence_reason":"Backfilled from paper_extraction_scorecards.reconstruction_score.","model_version":"phase0-backfill-v1","pipeline_version":"phase0-kernel-v1","computed_at":"2026-05-29T20:34:22.303Z","fresh_until":"2026-06-12T20:34:22.303Z","is_stale":false,"source_count":1,"missingness":["reproducibility_results.reproducibility_score","deployability_scores.score"]},"commercial":{"value":5,"scale":"0-10","confidence":0.75,"confidence_reason":"Backfilled from persisted commercial_flags and repo availability.","model_version":"phase0-backfill-v1","pipeline_version":"phase0-kernel-v1","computed_at":"2026-05-29T20:18:26.550Z","fresh_until":"2026-06-28T20:18:26.550Z","is_stale":false,"source_count":2,"missingness":[]},"market":{"value":5.5,"scale":"0-10","confidence":0.45,"confidence_reason":"Heuristic market score materialized from paper metadata, repo availability, and deployability.","model_version":"phase0-backfill-v1","pipeline_version":"phase0-kernel-v1","computed_at":"2026-05-29T20:35:42.578Z","fresh_until":"2026-06-12T20:35:42.578Z","is_stale":false,"source_count":1,"missingness":[]},"team":{"value":5.45,"scale":"0-10","confidence":0.42,"confidence_reason":"Heuristic fallback from paper author count and extracted affiliations.","model_version":"phase0-backfill-v1","pipeline_version":"phase0-kernel-v1","computed_at":"2026-05-29T20:18:26.550Z","fresh_until":"2026-06-12T20:18:26.550Z","is_stale":false,"source_count":2,"missingness":[]},"methodology":{"value":4.7,"scale":"0-10","confidence":0.82,"confidence_reason":"Backfilled from paper_extraction_scorecards.total_score.","model_version":"phase0-backfill-v1","pipeline_version":"phase0-kernel-v1","computed_at":"2026-05-29T20:18:50.096Z","fresh_until":"2026-06-28T20:18:50.096Z","is_stale":false,"source_count":2,"missingness":[]}},"evidence_receipt":{"freshness":"fresh","proof_status":"unverified","repo_status":"active","references_count":0,"source_count":4,"coverage":0.8333,"missingness":["references"],"unresolved_unknowns":[],"last_verification_at":"2026-05-29T20:34:22.303Z"},"lineage_hash":"8bc5c537706e38d38b27222ccea02569b82ebe1c24f8146b4eebc4310ed4d343"},"distribution":null,"replication_evidence":[],"author_dna":[]}