High-performance implementation of multiplication in finite fields on the Intel instruction set
/* multiplication in galois field with reduction */
/* gfmul: multiply a and b as elements of GF(2^128) and reduce the result
 * modulo the irreducible polynomial x^128 + x^7 + x^2 + x + 1, storing
 * the reduced 128-bit product in *res.
 * Depends on mul128() (defined elsewhere in this file) for the
 * unreduced 256-bit carry-less product. */
#ifdef __x86_64__
__attribute__((target("sse2,pclmul")))
#endif
inline void gfmul (__m128i a, __m128i b, __m128i *res){
__m128i tmp3, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
/* mask selecting only the lowest 32-bit lane of a vector */
__m128i XMMMASK = _mm_setr_epi32(0xffffffff, 0x0, 0x0, 0x0);
/* 256-bit carry-less product: low 128 bits -> tmp3, high 128 -> tmp6 */
mul128(a, b, &tmp3, &tmp6);
/* Per-lane shifts building
 *   T = srli(tmp6,31) ^ srli(tmp6,30) ^ srli(tmp6,25)
 * — the bits that would overflow past bit 127 when the reduction terms
 * tmp6<<1, tmp6<<2, tmp6<<7 are folded back in. */
tmp7 = _mm_srli_epi32(tmp6, 31);
tmp8 = _mm_srli_epi32(tmp6, 30);
tmp9 = _mm_srli_epi32(tmp6, 25);
tmp7 = _mm_xor_si128(tmp7, tmp8);
tmp7 = _mm_xor_si128(tmp7, tmp9);
/* 147 = 0b10010011 rotates the four lanes up by one position
 * (dst = [src3, src0, src1, src2]); splitting that with the mask gives
 * tmp7 = T >> 96 (old top lane in lane 0) and tmp8 = T << 32. */
tmp8 = _mm_shuffle_epi32(tmp7, 147);
tmp7 = _mm_and_si128(XMMMASK, tmp8);
tmp8 = _mm_andnot_si128(XMMMASK, tmp8);
/* fold: tmp3 ^= (T << 32), tmp6 ^= (T >> 96) */
tmp3 = _mm_xor_si128(tmp3, tmp8);
tmp6 = _mm_xor_si128(tmp6, tmp7);
/* accumulate ((T >> 96) ^ tmp6) << 1, << 2, << 7; per-lane shifts
 * suffice here because cross-lane carries were pre-folded via T */
tmp10 = _mm_slli_epi32(tmp6, 1);
tmp3 = _mm_xor_si128(tmp3, tmp10);
tmp11 = _mm_slli_epi32(tmp6, 2);
tmp3 = _mm_xor_si128(tmp3, tmp11);
tmp12 = _mm_slli_epi32(tmp6, 7);
tmp3 = _mm_xor_si128(tmp3, tmp12);
*res = _mm_xor_si128(tmp3, tmp6);
}
Prerequisite
- Operations in finite fields of modulo 128-degree polynomials
- Intel SSE instruction set
- Basic usage of Sagemath
The irreducible polynomial here is $x^{128} + x^7 + x^2 + x + 1$.
mul128
First, we discuss the multiplication of two 128-bit polynomials without reduction. For convenience, we represent each polynomial by the binary number whose bit $i$ is the coefficient of $x^i$. For example, the polynomial $x^4 + x^3 + x + 1$ corresponds to 0b11011, which is 27 in decimal.
Intel has an instruction specifically for 64-bit carry-less multiplication, exposed as the intrinsic `__m128i _mm_clmulepi64_si128(__m128i a, __m128i b, const int imm8);`. Its syntax is as follows:
- `__m128i a`: a source operand containing two 64-bit integers.
- `__m128i b`: a source operand containing two 64-bit integers.
- `const int imm8`: an 8-bit immediate value specifying which 64-bit halves to multiply:
  - if bit 0 of `imm8` is 0, the lower 64 bits of `a` are selected; otherwise, the upper 64 bits of `a`;
  - if bit 4 of `imm8` is 0, the lower 64 bits of `b` are selected; otherwise, the upper 64 bits of `b`.
We can implement 128-bit carry-less multiplication using 64-bit carry-less multiplication with the following steps:
/* multiplication in galois field without reduction */
/* mul128: 128-bit x 128-bit carry-less multiplication without reduction.
 * The 256-bit product of the polynomials a and b is returned through
 * *res1 (low 128 bits) and *res2 (high 128 bits).
 *
 * NOTE(review): two fixes versus the original —
 *  (1) the #ifdef opened for the target attribute was never closed,
 *      which is an "unterminated #ifdef" preprocessor error;
 *  (2) plain `inline` at file scope provides no external definition
 *      under C99/C11 inline semantics and can cause link errors, so the
 *      function is declared `static inline`. */
#ifdef __x86_64__
__attribute__((target("sse2,pclmul")))
#endif
static inline void mul128(__m128i a, __m128i b, __m128i *res1, __m128i *res2) {
    __m128i tmp3, tmp4, tmp5, tmp6;
    /* Four 64x64 carry-less partial products (a = a1*x^64 + a0, etc.):
     * tmp3 = a0*b0, tmp4 = a0*b1, tmp5 = a1*b0, tmp6 = a1*b1 */
    tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
    tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
    tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
    tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
    /* The middle term (a0*b1 ^ a1*b0) straddles the 128-bit boundary:
     * split it and fold each half into the outer products. */
    tmp4 = _mm_xor_si128(tmp4, tmp5);
    tmp5 = _mm_slli_si128(tmp4, 8);  /* low 64 bits, shifted up 64   */
    tmp4 = _mm_srli_si128(tmp4, 8);  /* high 64 bits, shifted down 64 */
    tmp3 = _mm_xor_si128(tmp3, tmp5);
    tmp6 = _mm_xor_si128(tmp6, tmp4);
    /* full product now in tmp3 (low) and tmp6 (high) */
    *res1 = tmp3;
    *res2 = tmp6;
}
Below is the proof of this algorithm:
Proof
Let the high and low 64-bit halves of the multiplicand $a$ be $a_1$ and $a_0$, so that $a = a_1 x^{64} + a_0$, and likewise write the multiplier as $b = b_1 x^{64} + b_0$. Then
$$ab = a_1 b_1 x^{128} + (a_1 b_0 + a_0 b_1) x^{64} + a_0 b_0.$$
In the code, `tmp3` holds $a_0 b_0$, `tmp6` holds $a_1 b_1$, and after the first XOR `tmp4` holds the middle term $a_1 b_0 + a_0 b_1$. The middle term straddles the 128-bit boundary, so it is split: its low half, shifted up 64 bits (`tmp5 = tmp4 << 64`), is XORed into `tmp3`, and its high half (`tmp4 >> 64`) is XORed into `tmp6`.
Hence, `res1` holds the low 128 bits of the 256-bit product and `res2` holds the high 128 bits.
gfmul
In practice, we also need to reduce the result of the carry-less multiplication modulo the irreducible polynomial $x^{128} + x^7 + x^2 + x + 1$. We can build a reference implementation in Sagemath to validate the results:
# --- Sagemath reference implementation --------------------------------
# Builds GF(2^128) with the same modulus as the C code, multiplies two
# sample elements, and prints integer-encoded results for comparison
# against the SIMD gfmul output.
# Define the polynomial ring over GF(2)
P.<x> = PolynomialRing(GF(2))
# Define the modulus polynomial
mod_poly = x^128 + x^7 + x^2 + x + 1
# Define the finite field GF(2^128)
K.<a> = GF(2**128, modulus=mod_poly)

# Convert an integer to a polynomial (bit i becomes the coefficient of x^i)
def int_to_poly(n):
    bin_str = bin(n)[2:] # Convert integer to binary string, remove '0b' prefix
    return sum([int(bit) * x^i for i, bit in enumerate(reversed(bin_str))])

# Convert a polynomial back to an integer (inverse of int_to_poly)
def poly_to_int(p):
    return sum([int(coeff) * (2**i) for i, coeff in enumerate(p.list())])

# Example: Elements represented as integers
f_int, g_int = 98195696920426533817649554218743231661, 43027262476631949179376797970948942433
# Convert to polynomials
f_poly = int_to_poly(f_int)
g_poly = int_to_poly(g_int)
# Perform operations in GF(2^128)
f = K(f_poly)
g = K(g_poly)
# Example operations (addition in GF(2^128) is XOR of the encodings)
add_result = f + g
mul_result = f * g
# Convert results back to integers
add_result_int = poly_to_int(add_result.polynomial())
mul_result_int = poly_to_int(mul_result.polynomial())
print("f (as int) =", f_int)
print("g (as int) =", g_int)
print("f + g (as int) =", add_result_int)
print("f * g (as int) =", mul_result_int)
# Expected output:
'''
f (as int) = 98195696920426533817649554218743231661
g (as int) = 43027262476631949179376797970948942433
f + g (as int) = 140241067422779931769655503331313065676
f * g (as int) = 30853704161780158484268560045100192027
'''
Once we have a proper validation program, we can analyze and debug the code provided at the beginning of the article.
Proof
Let us first analyze how to express the reduced product in terms of the two 128-bit halves returned by mul128:
tmp3, tmp6 = mul128(a, b)
// Assume T = ((tmp6 >> 127) ^ (tmp6 >> 126) ^ (tmp6 >> 121))
// Here, nr() denotes "not reduced"
res = nr(tmp3 ^ ((tmp6) << 128))
= tmp3 ^ nr(tmp6 << 128)
= tmp3 ^ nr(tmp6) ^ nr(tmp6 << 1) ^ nr(tmp6 << 2) ^ nr(tmp6 << 7)
= tmp3 ^ nr(tmp6) ^ (nr(tmp6) << 1) ^ (nr(tmp6) << 2) ^ (nr(tmp6) << 7)
// Since tmp6 >> 128 is definitely 0, it is omitted here
= tmp3 ^ (tmp6 ^ T)
^ ((tmp6 ^ T) << 1)
^ ((tmp6 ^ T) << 2)
^ ((tmp6 ^ T) << 7)
However, for computational convenience, we take T = ((tmp6 >> 31) ^ (tmp6 >> 30) ^ (tmp6 >> 25))
. Therefore, we have:
def gfmul(a, b):
    """Multiply a and b in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1.

    Pure-integer reference model of the SIMD implementation.  Relies on
    mul128(a, b) returning the (low, high) 128-bit halves of the 256-bit
    carry-less product.
    """
    tmp3, tmp6 = mul128(a, b)
    # (T >> 96) collects the bits of tmp6 that overflow past bit 127
    # once the reduction terms (tmp6 << 1/2/7) are folded back in.
    T = ((tmp6 >> 31) ^ (tmp6 >> 30) ^ (tmp6 >> 25))
    res = tmp3 ^ \
          (T >> 96) ^ tmp6 ^ \
          (((T >> 96) ^ tmp6) << 1) ^ \
          (((T >> 96) ^ tmp6) << 2) ^ \
          (((T >> 96) ^ tmp6) << 7)
    # BUG FIX: keep only the low 128 bits with a mask.  The original
    # `% (2 ** 128 - 1)` is a Mersenne-style modulus, not a mask, and
    # gives wrong values whenever res has bits at or above 2**128.
    return res & (2 ** 128 - 1)
After comparison with sagemath, their outputs are completely consistent.
Instruction Set Implementation
However, there is no direct operation that shifts a full 128-bit value left or right. The `<<` and `>>` shifts available here operate on each of the four 32-bit lanes that make up an `__m128i` independently, so bits are never carried across lane boundaries and a naive per-lane shift does not produce the correct 128-bit result.
In fact, these two operators correspond to the _mm_slli_epi32
and _mm_srli_epi32
instructions. The syntax of the former is as follows, and the latter is similar:
__m128i _mm_slli_epi32(__m128i a, int imm8);
The
_mm_slli_epi32
instruction performs a logical left shift on each 32-bit integer in registera
, with the number of bits to shift specified by the immediate valueimm8
. All integers are shifted left by the same number of bits, with the high bits shifted out being discarded and the low bits filled with zeros.
Through a short derivation, for any 128-bit number $x$ and any shift amount $0 < i < 32$, the following identity holds:
x << i === mm_slli_epi32(x, i) ^ (mm_srli_epi32(x, 32-i) << 32)
The proof is relatively straightforward and is omitted here.
Therefore, we can modify the original function:
def gfmul(a, b):
    """Multiply two elements of GF(2^128), modulus x^128 + x^7 + x^2 + x + 1.

    Integer reference model mirroring the SSE implementation: the T term
    corresponds to the per-lane srli shifts, and (T >> 96) folds the bits
    that overflow past bit 127 back into the low 128 bits.
    """
    low, high = mul128(a, b)
    # T = (srli(high, 31) ^ srli(high, 30) ^ srli(high, 25))
    T = (high >> 31) ^ (high >> 30) ^ (high >> 25)
    # Reduction: result = low ^ u ^ (u << 1) ^ (u << 2) ^ (u << 7)
    # with u = (T >> 96) ^ high, then truncated to 128 bits.
    folded = (T >> 96) ^ high
    acc = low
    for shift in (0, 1, 2, 7):
        acc ^= folded << shift
    return acc & ((1 << 128) - 1)
Now we can return to the source code mentioned at the beginning of this article:
/* Annotated copy of gfmul: GF(2^128) multiplication with reduction
 * modulo x^128 + x^7 + x^2 + x + 1; each step is labeled with the
 * integer-model expression it computes. */
void gfmul (__m128i a, __m128i b, __m128i *res){
__m128i tmp3, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
/* mask selecting only the lowest 32-bit lane */
__m128i XMMMASK = _mm_setr_epi32(0xffffffff, 0x0, 0x0, 0x0);
/* 256-bit carry-less product: tmp3 = low 128 bits, tmp6 = high 128 bits */
mul128(a, b, &tmp3, &tmp6);
tmp7 = _mm_srli_epi32(tmp6, 31);
tmp8 = _mm_srli_epi32(tmp6, 30);
tmp9 = _mm_srli_epi32(tmp6, 25);
tmp7 = _mm_xor_si128(tmp7, tmp8);
tmp7 = _mm_xor_si128(tmp7, tmp9);
// tmp7 = T = (srli(tmp6, 31) ^ srli(tmp6, 30) ^ srli(tmp6, 25))
/* 147 = 0b10010011: rotate the four lanes up by one position */
tmp8 = _mm_shuffle_epi32(tmp7, 147);
tmp7 = _mm_and_si128(XMMMASK, tmp8);
// tmp7 = T >> 96
tmp8 = _mm_andnot_si128(XMMMASK, tmp8);
// tmp8 = T << 32
tmp3 = _mm_xor_si128(tmp3, tmp8);
// tmp3 = ((T << 32) ^ tmp3)
tmp6 = _mm_xor_si128(tmp6, tmp7);
// tmp6 = ((T >> 96) ^ tmp6)
tmp10 = _mm_slli_epi32(tmp6, 1);
tmp3 = _mm_xor_si128(tmp3, tmp10);
tmp11 = _mm_slli_epi32(tmp6, 2);
tmp3 = _mm_xor_si128(tmp3, tmp11);
tmp12 = _mm_slli_epi32(tmp6, 7);
tmp3 = _mm_xor_si128(tmp3, tmp12);
*res = _mm_xor_si128(tmp3, tmp6);
// res = ((T << 32) ^ tmp3)
//       ^ slli(((T >> 96) ^ tmp6), 1)
//       ^ slli(((T >> 96) ^ tmp6), 2)
//       ^ slli(((T >> 96) ^ tmp6), 7)
//       ^ ((T >> 96) ^ tmp6)
}
Thus, we have achieved the goal of performing multiplication in $GF(2^{128})$ efficiently using the Intel instruction set.