%PDF-1.5 % 1 0 obj << /S /GoTo /D (section.1) >> endobj 4 0 obj (Introduction) endobj 5 0 obj << /S /GoTo /D (subsection.1.1) >> endobj 8 0 obj (Related Work) endobj 9 0 obj << /S /GoTo /D (section.2) >> endobj 12 0 obj (Preliminaries on Factorized Neural Layers) endobj 13 0 obj << /S /GoTo /D (subsection.2.1) >> endobj 16 0 obj (Fully-Connected Layers) endobj 17 0 obj << /S /GoTo /D (subsection.2.2) >> endobj 20 0 obj (Convolutional Layers) endobj 21 0 obj << /S /GoTo /D (subsection.2.3) >> endobj 24 0 obj (Multi-Head Attention) endobj 25 0 obj << /S /GoTo /D (section.3) >> endobj 28 0 obj (Initialization and Regularization) endobj 29 0 obj << /S /GoTo /D (subsection.3.1) >> endobj 32 0 obj (Spectral Initialization) endobj 33 0 obj << /S /GoTo /D (subsection.3.2) >> endobj 36 0 obj (Frobenius Decay) endobj 37 0 obj << /S /GoTo /D (subsection.3.3) >> endobj 40 0 obj (Initialization and Regularization in the Presence of Normalization) endobj 41 0 obj << /S /GoTo /D (section.4) >> endobj 44 0 obj (Compressed Model Training: Low-Rank, Sparse, and Tensorial) endobj 45 0 obj << /S /GoTo /D (section.5) >> endobj 48 0 obj (Overcomplete Knowledge Distillation) endobj 49 0 obj << /S /GoTo /D (section.6) >> endobj 52 0 obj (Multi-Head Attention as Factorized Quadratic Forms) endobj 53 0 obj << /S /GoTo /D (section.7) >> endobj 56 0 obj (Conclusion) endobj 57 0 obj << /S /GoTo /D (appendix.A) >> endobj 60 0 obj (Generalization of Factorized Layers) endobj 61 0 obj << /S /GoTo /D (appendix.B) >> endobj 64 0 obj (Proof of Claim 3.1) endobj 65 0 obj << /S /GoTo /D (appendix.C) >> endobj 68 0 obj (Experimental Details for Training Convolutional Networks) endobj 69 0 obj << /S /GoTo /D (appendix.D) >> endobj 72 0 obj (Experimental Details for Training Transformer Models) endobj 73 0 obj << /S /GoTo /D (appendix.E) >> endobj 76 0 obj (Past Work on Knowledge Distillation) endobj 77 0 obj << /S /GoTo /D [78 0 R /Fit] >> endobj 106 0 obj << /Length 3767 /Filter /FlateDecode >> stream x}ZY6~_- KYgm'kOJ