@inproceedings{vaswani2017,
  title     = {Attention Is All You Need},
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {30},
  pages     = {5998--6008},
  year      = {2017},
}
|
|
@inproceedings{press2022,
  title     = {Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation},
  author    = {Press, Ofir and Smith, Noah A and Lewis, Mike},
  booktitle = {International Conference on Learning Representations},
  year      = {2022},
}
|
|
@misc{shazeer2020,
  title         = {{GLU} Variants Improve Transformer},
  author        = {Shazeer, Noam},
  year          = {2020},
  eprint        = {2002.05202},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
}
|
|
@book{tolstoy,
  title     = {War and Peace},
  author    = {Tolstoy, Leo},
  publisher = {Project Gutenberg},
  year      = {1869},
  note      = {Dataset},
}
|
|
@misc{bigcode,
  title  = {The Stack},
  author = {{BigCode Project}},
  year   = {2022},
  note   = {Dataset},
}
|
|