% Bibliography database: model-architecture references followed by the
% corpora cited as datasets. Text outside entries is ignored by BibTeX;
% these notes deliberately avoid the at-sign so no parser mistakes them
% for the start of an entry.

% Transformer architecture paper (NeurIPS 2017, proceedings volume 30).
@inproceedings{vaswani2017,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention Is All You Need},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {30},
  year      = {2017},
}

% ALiBi positional-bias paper (ICLR 2022).
@inproceedings{press2022,
  author    = {Press, Ofir and Smith, Noah A. and Lewis, Mike},
  title     = {Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation},
  booktitle = {International Conference on Learning Representations},
  year      = {2022},
}

% arXiv-only preprint: stored as a misc entry with eprint fields rather
% than as an article whose journal field carries the arXiv identifier.
% The acronym GLU is brace-protected against sentence-casing styles.
@misc{shazeer2020,
  author        = {Shazeer, Noam},
  title         = {{GLU} Variants Improve Transformer},
  year          = {2020},
  eprint        = {2002.05202},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2002.05202},
}

% Text corpus cited as a dataset. Kept as a misc entry (like the other
% dataset below) because the record has no year, which the book type
% requires. NOTE(review): add the release year of the edition actually
% used once it is confirmed.
@misc{tolstoy,
  author       = {Tolstoy, Leo},
  title        = {War and {Peace}},
  howpublished = {Project Gutenberg},
  note         = {Dataset},
}

% Source-code corpus cited as a dataset. The corporate author is
% double-braced so it is treated as one indivisible name instead of
% being split into first and last name parts.
@misc{bigcode,
  author = {{BigCode Project}},
  title  = {The {Stack}},
  year   = {2022},
  note   = {Dataset},
}