File size: 964 Bytes
148b631
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
% Conference paper by Vaswani and co-authors, volume 30 of Advances in Neural Information Processing Systems (2017).
% Title and booktitle are stored in Title Case: a style can downcase them, but it cannot restore capitals that were never entered.
% The middle initial carries its period so that styles printing given names verbatim do not render a bare letter.
@inproceedings{vaswani2017,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention Is All You Need},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {30},
  year      = {2017}
}

% Conference paper by Press, Smith and Lewis, International Conference on Learning Representations (2022).
% The middle initial is written with its period; without it, styles that print given names as entered show a bare letter.
@inproceedings{press2022,
  author    = {Press, Ofir and Smith, Noah A. and Lewis, Mike},
  title     = {Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation},
  booktitle = {International Conference on Learning Representations},
  year      = {2022}
}

% arXiv preprint 2002.05202 by Noam Shazeer (2020).
% The arXiv identifier lives in the eprint/archiveprefix fields rather than in a fake journal name,
% so eprint-aware styles can format and link it; the url field is a fallback for styles that only know url.
% The acronym and the architecture name in the title are brace-protected against sentence-casing styles.
@misc{shazeer2020,
  author        = {Shazeer, Noam},
  title         = {{GLU} Variants Improve {Transformer}},
  year          = {2020},
  eprint        = {2002.05202},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2002.05202}
}

% Tolstoy's "War and Peace", cited with Project Gutenberg as the publisher; the note field marks it as a dataset.
% NOTE(review): no year is recorded here. Classic BibTeX styles expect a year on book entries and will warn
% about the empty field; add it once the exact edition that was used has been confirmed.
@book{tolstoy,
  author    = {Tolstoy, Leo},
  title     = {War and Peace},
  publisher = {Project Gutenberg},
  note      = {Dataset}
}

% "The Stack" (2022), attributed to the BigCode Project; the note field marks it as a dataset.
% The author is an organisation, so it is wrapped in an extra pair of braces: otherwise BibTeX parses it as a
% person with given name "BigCode" and surname "Project" and may abbreviate or invert it.
% The dataset name in the title is brace-protected so sentence-casing styles do not lowercase it.
@misc{bigcode,
  author = {{BigCode Project}},
  title  = {The {Stack}},
  year   = {2022},
  note   = {Dataset}
}