clang 22.0.0git
avxvnniint8intrin.h
Go to the documentation of this file.
1/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9#ifndef __IMMINTRIN_H
10#error \
11 "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXVNNIINT8INTRIN_H
15#define __AVXVNNIINT8INTRIN_H
16
17/ clang-format off
18/ Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
19/ corresponding signed 8-bit integers in \a __B, producing 4 intermediate
20/ signed 16-bit results. Sum these 4 results with the corresponding
21/ 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
22/
23/ \headerfile <x86intrin.h>
24/
25/ \code
26/ _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
27/ \endcode
28/
29/ This intrinsic corresponds to the \c VPDPBSSD instruction.
30/
31/ \param __A
32/ A 128-bit vector of [16 x char].
33/ \param __B
34/ A 128-bit vector of [16 x char].
35/ \returns
36/ A 128-bit vector of [4 x int].
37/
38/ \code{.operation}
39/ FOR j := 0 to 3
40/ tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
41/ tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
42/ tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
43/ tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
44/ dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
45/ ENDFOR
46/ dst[MAX:128] := 0
47/ \endcode
48/ clang-format on
// Expands to a single parenthesized expression: __W is cast to __v4si and
// __A / __B are each cast to __v16qi, the three values are passed to
// __builtin_ia32_vpdpbssd128, and the builtin's result is cast to __m128i.
// Every macro argument is wrapped in parentheses before its cast.
49#define _mm_dpbssd_epi32(__W, __A, __B) \
50 ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v16qi)(__A), \
51 (__v16qi)(__B)))
52
53/ clang-format off
54/ Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
55/ corresponding signed 8-bit integers in \a __B, producing 4 intermediate
56/ signed 16-bit results. Sum these 4 results with the corresponding
57/ 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
58/
59/ \headerfile <x86intrin.h>
60/
61/ \code
62/ _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
63/ \endcode
64/
65/ This intrinsic corresponds to the \c VPDPBSSD instruction.
66/
67/ \param __A
68/ A 256-bit vector of [32 x char].
69/ \param __B
70/ A 256-bit vector of [32 x char].
71/ \returns
72/ A 256-bit vector of [8 x int].
73/
74/ \code{.operation}
75/ FOR j := 0 to 7
76/ tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
77/ tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
78/ tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
79/ tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
80/ dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
81/ ENDFOR
82/ dst[MAX:256] := 0
83/ \endcode
84/ clang-format on
// Expands to a single parenthesized expression: __W is cast to __v8si and
// __A / __B are each cast to __v32qi, the three values are passed to
// __builtin_ia32_vpdpbssd256, and the builtin's result is cast to __m256i.
// Every macro argument is wrapped in parentheses before its cast.
85#define _mm256_dpbssd_epi32(__W, __A, __B) \
86 ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v32qi)(__A), \
87 (__v32qi)(__B)))
88
89/ clang-format off
90/ Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
91/ corresponding signed 8-bit integers in \a __B, producing 4 intermediate
92/ signed 16-bit results. Sum these 4 results with the corresponding
93/ 32-bit integer in \a __W with signed saturation, and store the packed
94/ 32-bit results in \a dst.
95/
96/ \headerfile <x86intrin.h>
97/
98/ \code
99/ _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
100/ \endcode
101/
102/ This intrinsic corresponds to the \c VPDPBSSDS instruction.
103/
104/ \param __A
105/ A 128-bit vector of [16 x char].
106/ \param __B
107/ A 128-bit vector of [16 x char].
108/ \returns
109/ A 128-bit vector of [4 x int].
110/
111/ \code{.operation}
112/ FOR j := 0 to 3
113/ tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
114/ tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
115/ tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
116/ tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
117/ dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
118/ ENDFOR
119/ dst[MAX:128] := 0
120/ \endcode
121/ clang-format on
// Expands to a single parenthesized expression: __W is cast to __v4si and
// __A / __B are each cast to __v16qi, the three values are passed to
// __builtin_ia32_vpdpbssds128, and the builtin's result is cast to __m128i.
// Every macro argument is wrapped in parentheses before its cast.
122#define _mm_dpbssds_epi32(__W, __A, __B) \
123 ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v16qi)(__A), \
124 (__v16qi)(__B)))
125
126/ clang-format off
127/ Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
128/ corresponding signed 8-bit integers in \a __B, producing 4 intermediate
129/ signed 16-bit results. Sum these 4 results with the corresponding
130/ 32-bit integer in \a __W with signed saturation, and store the packed
131/ 32-bit results in \a dst.
132/
133/ \headerfile <x86intrin.h>
134/
135/ \code
136/ _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
137/ \endcode
138/
139/ This intrinsic corresponds to the \c VPDPBSSDS instruction.
140/
141/ \param __A
142/ A 256-bit vector of [32 x char].
143/ \param __B
144/ A 256-bit vector of [32 x char].
145/ \returns
146/ A 256-bit vector of [8 x int].
147/
148/ \code{.operation}
149/ FOR j := 0 to 7
150/ tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
151/ tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
152/ tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
153/ tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
154/ dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
155/ ENDFOR
156/ dst[MAX:256] := 0
157/ \endcode
158/ clang-format on
// Expands to a single parenthesized expression: __W is cast to __v8si and
// __A / __B are each cast to __v32qi, the three values are passed to
// __builtin_ia32_vpdpbssds256, and the builtin's result is cast to __m256i.
// Every macro argument is wrapped in parentheses before its cast.
159#define _mm256_dpbssds_epi32(__W, __A, __B) \
160 ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v32qi)(__A), \
161 (__v32qi)(__B)))
162
163/ clang-format off
164/ Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
165/ corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
166/ signed 16-bit results. Sum these 4 results with the corresponding
167/ 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
168/
169/ \headerfile <x86intrin.h>
170/
171/ \code
172/ _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
173/ \endcode
174/
175/ This intrinsic corresponds to the \c VPDPBSUD instruction.
176/
177/ \param __A
178/ A 128-bit vector of [16 x char].
179/ \param __B
180/ A 128-bit vector of [16 x unsigned char].
181/ \returns
182/ A 128-bit vector of [4 x int].
183/
184/ \code{.operation}
185/ FOR j := 0 to 3
186/ tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
187/ tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
188/ tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
189/ tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
190/ dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
191/ ENDFOR
192/ dst[MAX:128] := 0
193/ \endcode
194/ clang-format on
// Expands to a single parenthesized expression: __W is cast to __v4si, __A to
// __v16qi and __B to __v16qu (note the two byte operands use different vector
// types), the three values are passed to __builtin_ia32_vpdpbsud128, and the
// builtin's result is cast to __m128i. Every macro argument is wrapped in
// parentheses before its cast.
195#define _mm_dpbsud_epi32(__W, __A, __B) \
196 ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v16qi)(__A), \
197 (__v16qu)(__B)))
198
199/ clang-format off
200/ Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
201/ corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
202/ signed 16-bit results. Sum these 4 results with the corresponding
203/ 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
204/
205/ \headerfile <x86intrin.h>
206/
207/ \code
208/ _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
209/ \endcode
210/
211/ This intrinsic corresponds to the \c VPDPBSUD instruction.
212/
213/ \param __A
214/ A 256-bit vector of [32 x char].
215/ \param __B
216/ A 256-bit vector of [32 x unsigned char].
217/ \returns
218/ A 256-bit vector of [8 x int].
219/
220/ \code{.operation}
221/ FOR j := 0 to 7
222/ tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
223/ tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
224/ tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
225/ tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
226/ dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
227/ ENDFOR
228/ dst[MAX:256] := 0
229/ \endcode
230/ clang-format on
// Expands to a single parenthesized expression: __W is cast to __v8si, __A to
// __v32qi and __B to __v32qu (note the two byte operands use different vector
// types), the three values are passed to __builtin_ia32_vpdpbsud256, and the
// builtin's result is cast to __m256i. Every macro argument is wrapped in
// parentheses before its cast.
231#define _mm256_dpbsud_epi32(__W, __A, __B) \
232 ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v32qi)(__A), \
233 (__v32qu)(__B)))
234
235/ clang-format off
236/ Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
237/ corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
238/ signed 16-bit results. Sum these 4 results with the corresponding
239/ 32-bit integer in \a __W with signed saturation, and store the packed
240/ 32-bit results in \a dst.
241/
242/ \headerfile <x86intrin.h>
243/
244/ \code
245/ _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
246/ \endcode
247/
248/ This intrinsic corresponds to the \c VPDPBSUDS instruction.
249/
250/ \param __A
251/ A 128-bit vector of [16 x char].
252/ \param __B
253/ A 128-bit vector of [16 x unsigned char].
254/ \returns
255/ A 128-bit vector of [4 x int].
256/
257/ \code{.operation}
258/ FOR j := 0 to 3
259/ tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
260/ tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
261/ tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
262/ tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
263/ dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
264/ ENDFOR
265/ dst[MAX:128] := 0
266/ \endcode
267/ clang-format on
// Expands to a single parenthesized expression: __W is cast to __v4si, __A to
// __v16qi and __B to __v16qu (note the two byte operands use different vector
// types), the three values are passed to __builtin_ia32_vpdpbsuds128, and the
// builtin's result is cast to __m128i. Every macro argument is wrapped in
// parentheses before its cast.
268#define _mm_dpbsuds_epi32(__W, __A, __B) \
269 ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v16qi)(__A), \
270 (__v16qu)(__B)))
271
272/ clang-format off
273/ Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
274/ corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
275/ signed 16-bit results. Sum these 4 results with the corresponding
276/ 32-bit integer in \a __W with signed saturation, and store the packed
277/ 32-bit results in \a dst.
278/
279/ \headerfile <x86intrin.h>
280/
281/ \code
282/ _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
283/ \endcode
284/
285/ This intrinsic corresponds to the \c VPDPBSUDS instruction.
286/
287/ \param __A
288/ A 256-bit vector of [32 x char].
289/ \param __B
290/ A 256-bit vector of [32 x unsigned char].
291/ \returns
292/ A 256-bit vector of [8 x int].
293/
294/ \code{.operation}
295/ FOR j := 0 to 7
296/ tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
297/ tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
298/ tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
299/ tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
300/ dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
301/ ENDFOR
302/ dst[MAX:256] := 0
303/ \endcode
304/ clang-format on
// Expands to a single parenthesized expression: __W is cast to __v8si, __A to
// __v32qi and __B to __v32qu (note the two byte operands use different vector
// types), the three values are passed to __builtin_ia32_vpdpbsuds256, and the
// builtin's result is cast to __m256i. Every macro argument is wrapped in
// parentheses before its cast.
305#define _mm256_dpbsuds_epi32(__W, __A, __B) \
306 ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v32qi)(__A), \
307 (__v32qu)(__B)))
308
309/ clang-format off
310/ Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
311/ corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
312/ signed 16-bit results. Sum these 4 results with the corresponding
313/ 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
314/
315/ \headerfile <x86intrin.h>
316/
317/ \code
318/ _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
319/ \endcode
320/
321/ This intrinsic corresponds to the \c VPDPBUUD instruction.
322/
323/ \param __A
324/ A 128-bit vector of [16 x unsigned char].
325/ \param __B
326/ A 128-bit vector of [16 x unsigned char].
327/ \returns
328/ A 128-bit vector of [4 x int].
329/
330/ \code{.operation}
331/ FOR j := 0 to 3
332/ tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
333/ tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
334/ tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
335/ tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
336/ dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
337/ ENDFOR
338/ dst[MAX:128] := 0
339/ \endcode
340/ clang-format on
// Expands to a single parenthesized expression: __W is cast to __v4si and
// __A / __B are each cast to __v16qu, the three values are passed to
// __builtin_ia32_vpdpbuud128, and the builtin's result is cast to __m128i.
// Every macro argument is wrapped in parentheses before its cast.
341#define _mm_dpbuud_epi32(__W, __A, __B) \
342 ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v16qu)(__A), \
343 (__v16qu)(__B)))
344
345/ clang-format off
346/ Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
347/ corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
348/ signed 16-bit results. Sum these 4 results with the corresponding
349/ 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
350/
351/ \headerfile <x86intrin.h>
352/
353/ \code
354/ _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
355/ \endcode
356/
357/ This intrinsic corresponds to the \c VPDPBUUD instruction.
358/
359/ \param __A
360/ A 256-bit vector of [32 x unsigned char].
361/ \param __B
362/ A 256-bit vector of [32 x unsigned char].
363/ \returns
364/ A 256-bit vector of [8 x int].
365/
366/ \code{.operation}
367/ FOR j := 0 to 7
368/ tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
369/ tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
370/ tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
371/ tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
372/ dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
373/ ENDFOR
374/ dst[MAX:256] := 0
375/ \endcode
376/ clang-format on
// Expands to a single parenthesized expression: __W is cast to __v8si and
// __A / __B are each cast to __v32qu, the three values are passed to
// __builtin_ia32_vpdpbuud256, and the builtin's result is cast to __m256i.
// Every macro argument is wrapped in parentheses before its cast.
377#define _mm256_dpbuud_epi32(__W, __A, __B) \
378 ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v32qu)(__A), \
379 (__v32qu)(__B)))
380
381/ clang-format off
382/ Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
383/ corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
384/ signed 16-bit results. Sum these 4 results with the corresponding
385/ 32-bit integer in \a __W with unsigned saturation, and store the packed
386/ 32-bit results in \a dst.
387/
388/ \headerfile <x86intrin.h>
389/
390/ \code
391/ _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
392/ \endcode
393/
394/ This intrinsic corresponds to the \c VPDPBUUDS instruction.
395/
396/ \param __A
397/ A 128-bit vector of [16 x unsigned char].
398/ \param __B
399/ A 128-bit vector of [16 x unsigned char].
400/ \returns
401/ A 128-bit vector of [4 x int].
402/
403/ \code{.operation}
404/ FOR j := 0 to 3
405/ tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
406/ tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
407/ tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
408/ tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
409/ dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
410/ ENDFOR
411/ dst[MAX:128] := 0
412/ \endcode
413/ clang-format on
// Expands to a single parenthesized expression: __W is cast to __v4si and
// __A / __B are each cast to __v16qu, the three values are passed to
// __builtin_ia32_vpdpbuuds128, and the builtin's result is cast to __m128i.
// Every macro argument is wrapped in parentheses before its cast.
414#define _mm_dpbuuds_epi32(__W, __A, __B) \
415 ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v16qu)(__A), \
416 (__v16qu)(__B)))
417
418/ clang-format off
419/ Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
420/ signed 16-bit results. Sum these 4 results with the corresponding
421/ 32-bit integer in \a __W with unsigned saturation, and store the packed
422/ 32-bit results in \a dst.
423/
424/ \headerfile <x86intrin.h>
425/
426/ \code
427/ _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
428/ \endcode
429/
430/ This intrinsic corresponds to the \c VPDPBUUDS instruction.
431/
432/ \param __A
433/ A 256-bit vector of [32 x unsigned char].
434/ \param __B
435/ A 256-bit vector of [32 x unsigned char].
436/ \returns
437/ A 256-bit vector of [8 x int].
438/
439/ \code{.operation}
440/ FOR j := 0 to 7
441/ tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
442/ tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
443/ tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
444/ tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
445/ dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
446/ ENDFOR
447/ dst[MAX:256] := 0
448/ \endcode
449/ clang-format on
// Expands to a single parenthesized expression: __W is cast to __v8si and
// __A / __B are each cast to __v32qu, the three values are passed to
// __builtin_ia32_vpdpbuuds256, and the builtin's result is cast to __m256i.
// Every macro argument is wrapped in parentheses before its cast.
450#define _mm256_dpbuuds_epi32(__W, __A, __B) \
451 ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v32qu)(__A), \
452 (__v32qu)(__B)))
453
454#endif / __AVXVNNIINT8INTRIN_H

Follow Lee on X/Twitter - Father, Husband, Serial builder creating AI, crypto, games & web tools. We are friends :) AI Will Come To Life!

Check out: eBank.nz (Art Generator) | Netwrck.com (AI Tools) | Text-Generator.io (AI API) | BitBank.nz (Crypto AI) | ReadingTime (Kids Reading) | RewordGame | BigMultiplayerChess | WebFiddle | How.nz | Helix AI Assistant