% Clean references for model-evaluation memo
% 10 unique entries — deduplicated, verified, formatted
% Generated 2026-04-11

% ============================================================
%  Foundation Models
% ============================================================

% NeurIPS 2017 (Advances in NeurIPS 30); volume and page range added from the
% published proceedings so journal-style .bst files render a complete reference.
@inproceedings{vaswani2017attention,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention Is All You Need},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {30},
  pages     = {5998--6008},
  year      = {2017},
  eprint    = {1706.03762},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
}

% NeurIPS 2020 (Advances in NeurIPS 33); volume and page range added from the
% published proceedings so journal-style .bst files render a complete reference.
@inproceedings{brown2020gpt3,
  author    = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D. and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
  title     = {Language Models are Few-Shot Learners},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {33},
  pages     = {1877--1901},
  year      = {2020},
  eprint    = {2005.14165},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
}

% JMLR 24(240), 2023; arXiv:2204.02311. Author list truncated with the
% literal "and others" token (rendered as et al. by the style).
@article{chowdhery2023palm,
  title         = {{PaLM}: Scaling Language Modeling with Pathways},
  author        = {Chowdhery, Aakanksha and Narang, Sharan and Devlin, Jacob and Bosma, Maarten and Mishra, Gaurav and Roberts, Adam and Barham, Paul and Chung, Hyung Won and Sutton, Charles and Gehrmann, Sebastian and others},
  journal       = {Journal of Machine Learning Research},
  volume        = {24},
  number        = {240},
  pages         = {1--113},
  year          = {2023},
  eprint        = {2204.02311},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
}

% arXiv preprint 2302.13971 (cs.CL), 2023; @misc with eprint fields is the
% correct form for an unpublished arXiv report.
@misc{touvron2023llama,
  title         = {{LLaMA}: Open and Efficient Foundation Language Models},
  author        = {Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and Rodriguez, Aurelien and Joulin, Armand and Grave, Edouard and Lample, Guillaume},
  year          = {2023},
  eprint        = {2302.13971},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
}

% ============================================================
%  Benchmarks and Evaluation
% ============================================================

% ICLR 2021; arXiv:2009.03300 (the MMLU benchmark, per the citation key).
@inproceedings{hendrycks2021mmlu,
  title         = {Measuring Massive Multitask Language Understanding},
  author        = {Hendrycks, Dan and Burns, Collin and Basart, Steven and Zou, Andy and Mazeika, Mantas and Song, Dawn and Steinhardt, Jacob},
  booktitle     = {International Conference on Learning Representations},
  year          = {2021},
  eprint        = {2009.03300},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CY},
}

% arXiv preprint 1803.05457 (cs.AI), 2018; braces in the title protect the
% {Try ARC} sentence opener and the {AI2} acronym from style recasing.
@misc{clark2018arc,
  title         = {Think you have Solved Question Answering? {Try ARC}, the {AI2} Reasoning Challenge},
  author        = {Clark, Peter and Cowhey, Isaac and Etzioni, Oren and Khot, Tushar and Sabharwal, Ashish and Schoenick, Carissa and Tafjord, Oyvind},
  year          = {2018},
  eprint        = {1803.05457},
  archiveprefix = {arXiv},
  primaryclass  = {cs.AI},
}

% ACL 2019 (DOI 10.18653/v1/P19-1472). Page range added for the published
% version, plus arXiv eprint metadata (1905.07830) for consistency with the
% other entries in this file.
@inproceedings{zellers2019hellaswag,
  author    = {Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
  title     = {{HellaSwag}: Can a Machine Really Finish Your Sentence?},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  pages     = {4791--4800},
  year      = {2019},
  doi       = {10.18653/v1/P19-1472},
  eprint    = {1905.07830},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
}

% TMLR 2023; arXiv:2211.09110. Author list truncated with the literal
% "and others" token (rendered as et al. by the style).
@article{liang2023holistic,
  title         = {Holistic Evaluation of Language Models},
  author        = {Liang, Percy and Bommasani, Rishi and Lee, Tony and Tsipras, Dimitris and Soylu, Dilara and Yasunaga, Michihiro and Zhang, Yian and Narayanan, Deepak and Wu, Yuhuai and Kumar, Ananya and others},
  journal       = {Transactions on Machine Learning Research},
  year          = {2023},
  eprint        = {2211.09110},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
}

% NeurIPS 2023; arXiv:2306.05685. Braces protect the LLM, MT-Bench, and
% Chatbot Arena names from style recasing.
@inproceedings{zheng2023judging,
  title         = {Judging {LLM}-as-a-Judge with {MT-Bench} and {Chatbot Arena}},
  author        = {Zheng, Lianmin and Chiang, Wei-Lin and Sheng, Ying and Zhuang, Siyuan and Wu, Zhanghao and Zhuang, Yonghao and Lin, Zi and Li, Zhuohan and Li, Dacheng and Xing, Eric P. and Zhang, Hao and Gonzalez, Joseph E. and Stoica, Ion},
  booktitle     = {Advances in Neural Information Processing Systems},
  year          = {2023},
  eprint        = {2306.05685},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
}

% ============================================================
%  Prompting and Reasoning
% ============================================================

% NeurIPS 2022 (Advances in NeurIPS 35); volume and page range added from the
% published proceedings so journal-style .bst files render a complete reference.
@inproceedings{wei2022chainofthought,
  author    = {Wei, Jason and Wang, Xuezhi and Schuurmans, Dale and Bosma, Maarten and Ichter, Brian and Xia, Fei and Chi, Ed H. and Le, Quoc V. and Zhou, Denny},
  title     = {Chain-of-Thought Prompting Elicits Reasoning in Large Language Models},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {35},
  pages     = {24824--24837},
  year      = {2022},
  eprint    = {2201.11903},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
}
