Spaces:
Running
Running
Eve committed on
Commit ·
f2986f6
1
Parent(s): 868283e
ggml : IQ4_NL sgemm + Q4_0 AVX optimization (llama/9422)
Browse files* squashed
readd my iq4_nl sgemm PR https://github.com/ggerganov/llama.cpp/pull/8049
have ggml_vec_dot_q4_0 do two blocks per loop for avx
try out f16c ggml_vec_dot_iq4_nl, but it's not really faster. as per https://github.com/ggerganov/llama.cpp/pull/8549 we can calculate several blocks at a time with no issue
* shuffle
* remove f16c iq4_nl as I can't make it faster than before
- ggml/src/ggml-quants.c +33 -36
ggml/src/ggml-quants.c
CHANGED
|
@@ -230,6 +230,12 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
|
|
| 230 |
|
| 231 |
return _mm_packus_epi16( bytes1, bytes2);
|
| 232 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
#endif
|
| 234 |
#elif defined(__SSSE3__)
|
| 235 |
// horizontally add 4x4 floats
|
|
@@ -4206,37 +4212,37 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
| 4206 |
|
| 4207 |
sumf = hsum_float_8(acc);
|
| 4208 |
#elif defined(__AVX__)
|
| 4209 |
-
|
| 4210 |
-
__m256 acc = _mm256_setzero_ps();
|
| 4211 |
-
|
| 4212 |
-
// Main loop
|
| 4213 |
-
for (; ib < nb; ++ib) {
|
| 4214 |
-
// Compute combined scale for the block
|
| 4215 |
-
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
|
| 4216 |
-
|
| 4217 |
-
const __m128i lowMask = _mm_set1_epi8(0xF);
|
| 4218 |
-
const __m128i off = _mm_set1_epi8(8);
|
| 4219 |
-
|
| 4220 |
-
const __m128i tmp = _mm_loadu_si128((const __m128i *)x[ib].qs);
|
| 4221 |
-
|
| 4222 |
-
__m128i bx_0 = _mm_and_si128(lowMask, tmp);
|
| 4223 |
-
__m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
|
| 4224 |
-
bx_0 = _mm_sub_epi8(bx_0, off);
|
| 4225 |
-
const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
|
| 4226 |
-
|
| 4227 |
-
bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
|
| 4228 |
-
by_0 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16));
|
| 4229 |
-
bx_0 = _mm_sub_epi8(bx_0, off);
|
| 4230 |
-
const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0);
|
| 4231 |
|
| 4232 |
-
|
| 4233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4234 |
|
| 4235 |
-
|
| 4236 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4237 |
}
|
| 4238 |
|
| 4239 |
-
sumf = hsum_float_8(
|
| 4240 |
#elif defined(__SSSE3__)
|
| 4241 |
// set constants
|
| 4242 |
const __m128i lowMask = _mm_set1_epi8(0xF);
|
|
@@ -11819,15 +11825,6 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
|
|
| 11819 |
#endif
|
| 11820 |
}
|
| 11821 |
|
| 11822 |
-
|
| 11823 |
-
#if defined(__AVX__)
|
| 11824 |
-
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
|
| 11825 |
-
const __m128i ax = _mm_sign_epi8(x, x);
|
| 11826 |
-
const __m128i sy = _mm_sign_epi8(y, x);
|
| 11827 |
-
return _mm_maddubs_epi16(ax, sy);
|
| 11828 |
-
}
|
| 11829 |
-
#endif
|
| 11830 |
-
|
| 11831 |
#if defined(__AVX2__)
|
| 11832 |
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
| 11833 |
const __m256i ax = _mm256_sign_epi8(x, x);
|
|
|
|
| 230 |
|
| 231 |
return _mm_packus_epi16( bytes1, bytes2);
|
| 232 |
}
|
| 233 |
+
|
| 234 |
+
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
|
| 235 |
+
const __m128i ax = _mm_sign_epi8(x, x);
|
| 236 |
+
const __m128i sy = _mm_sign_epi8(y, x);
|
| 237 |
+
return _mm_maddubs_epi16(ax, sy);
|
| 238 |
+
}
|
| 239 |
#endif
|
| 240 |
#elif defined(__SSSE3__)
|
| 241 |
// horizontally add 4x4 floats
|
|
|
|
| 4212 |
|
| 4213 |
sumf = hsum_float_8(acc);
|
| 4214 |
#elif defined(__AVX__)
|
| 4215 |
+
const __m128i mone = _mm_set1_epi16(1);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4216 |
|
| 4217 |
+
__m256 accum1 = _mm256_setzero_ps();
|
| 4218 |
+
__m256 accum2 = _mm256_setzero_ps();
|
| 4219 |
+
for (; ib + 1 < nb; ib += 2) {
|
| 4220 |
+
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
|
| 4221 |
+
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
|
| 4222 |
+
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
|
| 4223 |
+
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
|
| 4224 |
+
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
|
| 4225 |
+
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
|
| 4226 |
|
| 4227 |
+
const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8));
|
| 4228 |
+
const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
|
| 4229 |
+
const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
|
| 4230 |
+
const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
|
| 4231 |
+
const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
|
| 4232 |
+
const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
|
| 4233 |
+
const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
|
| 4234 |
+
const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
|
| 4235 |
+
const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
|
| 4236 |
+
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
|
| 4237 |
+
const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
|
| 4238 |
+
const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
|
| 4239 |
+
accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
|
| 4240 |
+
_mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
|
| 4241 |
+
accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
|
| 4242 |
+
_mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
|
| 4243 |
}
|
| 4244 |
|
| 4245 |
+
sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
|
| 4246 |
#elif defined(__SSSE3__)
|
| 4247 |
// set constants
|
| 4248 |
const __m128i lowMask = _mm_set1_epi8(0xF);
|
|
|
|
| 11825 |
#endif
|
| 11826 |
}
|
| 11827 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11828 |
#if defined(__AVX2__)
|
| 11829 |
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
| 11830 |
const __m256i ax = _mm256_sign_epi8(x, x);
|