summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAdam Langley <agl@google.com>2015-08-26 13:48:40 -0700
committerAdam Langley <agl@google.com>2015-08-26 15:09:04 -0700
commit95b10c44db8faa0e3a95c76440b2b0d4665eb7a8 (patch)
tree4ebb615126519f0a4bcdd3ba848976cd6c37d9d3
parent92dd82d23b0a37d6c3ae50f114dc22ce055ce1f3 (diff)
downloaded25519-95b10c44db8faa0e3a95c76440b2b0d4665eb7a8.tar.xz
Simplify and optimise some operations
-rw-r--r--edwards25519/edwards25519.go798
1 files changed, 268 insertions, 530 deletions
diff --git a/edwards25519/edwards25519.go b/edwards25519/edwards25519.go
index af2983f..a655825 100644
--- a/edwards25519/edwards25519.go
+++ b/edwards25519/edwards25519.go
@@ -16,17 +16,10 @@ package edwards25519
// context.
type FieldElement [10]int32
+var zero FieldElement
+
func FeZero(fe *FieldElement) {
- fe[0] = 0
- fe[1] = 0
- fe[2] = 0
- fe[3] = 0
- fe[4] = 0
- fe[5] = 0
- fe[6] = 0
- fe[7] = 0
- fe[8] = 0
- fe[9] = 0
+ copy(fe[:], zero[:])
}
func FeOne(fe *FieldElement) {
@@ -111,49 +104,7 @@ func FeFromBytes(dst *FieldElement, src *[32]byte) {
h8 := load3(src[26:]) << 4
h9 := (load3(src[29:]) & 8388607) << 2
- var carry [10]int64
- carry[9] = (h9 + 1<<24) >> 25
- h0 += carry[9] * 19
- h9 -= carry[9] << 25
- carry[1] = (h1 + 1<<24) >> 25
- h2 += carry[1]
- h1 -= carry[1] << 25
- carry[3] = (h3 + 1<<24) >> 25
- h4 += carry[3]
- h3 -= carry[3] << 25
- carry[5] = (h5 + 1<<24) >> 25
- h6 += carry[5]
- h5 -= carry[5] << 25
- carry[7] = (h7 + 1<<24) >> 25
- h8 += carry[7]
- h7 -= carry[7] << 25
-
- carry[0] = (h0 + 1<<25) >> 26
- h1 += carry[0]
- h0 -= carry[0] << 26
- carry[2] = (h2 + 1<<25) >> 26
- h3 += carry[2]
- h2 -= carry[2] << 26
- carry[4] = (h4 + 1<<25) >> 26
- h5 += carry[4]
- h4 -= carry[4] << 26
- carry[6] = (h6 + 1<<25) >> 26
- h7 += carry[6]
- h6 -= carry[6] << 26
- carry[8] = (h8 + 1<<25) >> 26
- h9 += carry[8]
- h8 -= carry[8] << 26
-
- dst[0] = int32(h0)
- dst[1] = int32(h1)
- dst[2] = int32(h2)
- dst[3] = int32(h3)
- dst[4] = int32(h4)
- dst[5] = int32(h5)
- dst[6] = int32(h6)
- dst[7] = int32(h7)
- dst[8] = int32(h8)
- dst[9] = int32(h9)
+ FeCombine(dst, h0, h1, h2, h3, h4, h5, h6, h7, h8, h9)
}
// FeToBytes marshals h to s.
@@ -307,180 +258,9 @@ func FeNeg(h, f *FieldElement) {
h[9] = -f[9]
}
-// FeMul calculates h = f * g
-// Can overlap h with f or g.
-//
-// Preconditions:
-// |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
-// |g| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
-//
-// Postconditions:
-// |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
-//
-// Notes on implementation strategy:
-//
-// Using schoolbook multiplication.
-// Karatsuba would save a little in some cost models.
-//
-// Most multiplications by 2 and 19 are 32-bit precomputations;
-// cheaper than 64-bit postcomputations.
-//
-// There is one remaining multiplication by 19 in the carry chain;
-// one *19 precomputation can be merged into this,
-// but the resulting data flow is considerably less clean.
-//
-// There are 12 carries below.
-// 10 of them are 2-way parallelizable and vectorizable.
-// Can get away with 11 carries, but then data flow is much deeper.
-//
-// With tighter constraints on inputs can squeeze carries into int32.
-func FeMul(h, f, g *FieldElement) {
- f0 := f[0]
- f1 := f[1]
- f2 := f[2]
- f3 := f[3]
- f4 := f[4]
- f5 := f[5]
- f6 := f[6]
- f7 := f[7]
- f8 := f[8]
- f9 := f[9]
- g0 := g[0]
- g1 := g[1]
- g2 := g[2]
- g3 := g[3]
- g4 := g[4]
- g5 := g[5]
- g6 := g[6]
- g7 := g[7]
- g8 := g[8]
- g9 := g[9]
- g1_19 := 19 * g1 /* 1.4*2^29 */
- g2_19 := 19 * g2 /* 1.4*2^30; still ok */
- g3_19 := 19 * g3
- g4_19 := 19 * g4
- g5_19 := 19 * g5
- g6_19 := 19 * g6
- g7_19 := 19 * g7
- g8_19 := 19 * g8
- g9_19 := 19 * g9
- f1_2 := 2 * f1
- f3_2 := 2 * f3
- f5_2 := 2 * f5
- f7_2 := 2 * f7
- f9_2 := 2 * f9
- f0g0 := int64(f0) * int64(g0)
- f0g1 := int64(f0) * int64(g1)
- f0g2 := int64(f0) * int64(g2)
- f0g3 := int64(f0) * int64(g3)
- f0g4 := int64(f0) * int64(g4)
- f0g5 := int64(f0) * int64(g5)
- f0g6 := int64(f0) * int64(g6)
- f0g7 := int64(f0) * int64(g7)
- f0g8 := int64(f0) * int64(g8)
- f0g9 := int64(f0) * int64(g9)
- f1g0 := int64(f1) * int64(g0)
- f1g1_2 := int64(f1_2) * int64(g1)
- f1g2 := int64(f1) * int64(g2)
- f1g3_2 := int64(f1_2) * int64(g3)
- f1g4 := int64(f1) * int64(g4)
- f1g5_2 := int64(f1_2) * int64(g5)
- f1g6 := int64(f1) * int64(g6)
- f1g7_2 := int64(f1_2) * int64(g7)
- f1g8 := int64(f1) * int64(g8)
- f1g9_38 := int64(f1_2) * int64(g9_19)
- f2g0 := int64(f2) * int64(g0)
- f2g1 := int64(f2) * int64(g1)
- f2g2 := int64(f2) * int64(g2)
- f2g3 := int64(f2) * int64(g3)
- f2g4 := int64(f2) * int64(g4)
- f2g5 := int64(f2) * int64(g5)
- f2g6 := int64(f2) * int64(g6)
- f2g7 := int64(f2) * int64(g7)
- f2g8_19 := int64(f2) * int64(g8_19)
- f2g9_19 := int64(f2) * int64(g9_19)
- f3g0 := int64(f3) * int64(g0)
- f3g1_2 := int64(f3_2) * int64(g1)
- f3g2 := int64(f3) * int64(g2)
- f3g3_2 := int64(f3_2) * int64(g3)
- f3g4 := int64(f3) * int64(g4)
- f3g5_2 := int64(f3_2) * int64(g5)
- f3g6 := int64(f3) * int64(g6)
- f3g7_38 := int64(f3_2) * int64(g7_19)
- f3g8_19 := int64(f3) * int64(g8_19)
- f3g9_38 := int64(f3_2) * int64(g9_19)
- f4g0 := int64(f4) * int64(g0)
- f4g1 := int64(f4) * int64(g1)
- f4g2 := int64(f4) * int64(g2)
- f4g3 := int64(f4) * int64(g3)
- f4g4 := int64(f4) * int64(g4)
- f4g5 := int64(f4) * int64(g5)
- f4g6_19 := int64(f4) * int64(g6_19)
- f4g7_19 := int64(f4) * int64(g7_19)
- f4g8_19 := int64(f4) * int64(g8_19)
- f4g9_19 := int64(f4) * int64(g9_19)
- f5g0 := int64(f5) * int64(g0)
- f5g1_2 := int64(f5_2) * int64(g1)
- f5g2 := int64(f5) * int64(g2)
- f5g3_2 := int64(f5_2) * int64(g3)
- f5g4 := int64(f5) * int64(g4)
- f5g5_38 := int64(f5_2) * int64(g5_19)
- f5g6_19 := int64(f5) * int64(g6_19)
- f5g7_38 := int64(f5_2) * int64(g7_19)
- f5g8_19 := int64(f5) * int64(g8_19)
- f5g9_38 := int64(f5_2) * int64(g9_19)
- f6g0 := int64(f6) * int64(g0)
- f6g1 := int64(f6) * int64(g1)
- f6g2 := int64(f6) * int64(g2)
- f6g3 := int64(f6) * int64(g3)
- f6g4_19 := int64(f6) * int64(g4_19)
- f6g5_19 := int64(f6) * int64(g5_19)
- f6g6_19 := int64(f6) * int64(g6_19)
- f6g7_19 := int64(f6) * int64(g7_19)
- f6g8_19 := int64(f6) * int64(g8_19)
- f6g9_19 := int64(f6) * int64(g9_19)
- f7g0 := int64(f7) * int64(g0)
- f7g1_2 := int64(f7_2) * int64(g1)
- f7g2 := int64(f7) * int64(g2)
- f7g3_38 := int64(f7_2) * int64(g3_19)
- f7g4_19 := int64(f7) * int64(g4_19)
- f7g5_38 := int64(f7_2) * int64(g5_19)
- f7g6_19 := int64(f7) * int64(g6_19)
- f7g7_38 := int64(f7_2) * int64(g7_19)
- f7g8_19 := int64(f7) * int64(g8_19)
- f7g9_38 := int64(f7_2) * int64(g9_19)
- f8g0 := int64(f8) * int64(g0)
- f8g1 := int64(f8) * int64(g1)
- f8g2_19 := int64(f8) * int64(g2_19)
- f8g3_19 := int64(f8) * int64(g3_19)
- f8g4_19 := int64(f8) * int64(g4_19)
- f8g5_19 := int64(f8) * int64(g5_19)
- f8g6_19 := int64(f8) * int64(g6_19)
- f8g7_19 := int64(f8) * int64(g7_19)
- f8g8_19 := int64(f8) * int64(g8_19)
- f8g9_19 := int64(f8) * int64(g9_19)
- f9g0 := int64(f9) * int64(g0)
- f9g1_38 := int64(f9_2) * int64(g1_19)
- f9g2_19 := int64(f9) * int64(g2_19)
- f9g3_38 := int64(f9_2) * int64(g3_19)
- f9g4_19 := int64(f9) * int64(g4_19)
- f9g5_38 := int64(f9_2) * int64(g5_19)
- f9g6_19 := int64(f9) * int64(g6_19)
- f9g7_38 := int64(f9_2) * int64(g7_19)
- f9g8_19 := int64(f9) * int64(g8_19)
- f9g9_38 := int64(f9_2) * int64(g9_19)
- h0 := f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38
- h1 := f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19
- h2 := f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38
- h3 := f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19
- h4 := f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38
- h5 := f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19
- h6 := f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 + f6g0 + f7g9_38 + f8g8_19 + f9g7_38
- h7 := f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + f8g9_19 + f9g8_19
- h8 := f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 + f8g0 + f9g9_38
- h9 := f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0
- var carry [10]int64
+func FeCombine(h *FieldElement, h0, h1, h2, h3, h4, h5, h6, h7, h8, h9 int64) {
+ var c0, c1, c2, c3, c4, c5, c6, c7, c8, c9 int64
/*
|h0| <= (1.1*1.1*2^52*(1+19+19+19+19)+1.1*1.1*2^50*(38+38+38+38+38))
i.e. |h0| <= 1.2*2^59; narrower ranges for h2, h4, h6, h8
@@ -488,70 +268,70 @@ func FeMul(h, f, g *FieldElement) {
i.e. |h1| <= 1.5*2^58; narrower ranges for h3, h5, h7, h9
*/
- carry[0] = (h0 + (1 << 25)) >> 26
- h1 += carry[0]
- h0 -= carry[0] << 26
- carry[4] = (h4 + (1 << 25)) >> 26
- h5 += carry[4]
- h4 -= carry[4] << 26
+ c0 = (h0 + (1 << 25)) >> 26
+ h1 += c0
+ h0 -= c0 << 26
+ c4 = (h4 + (1 << 25)) >> 26
+ h5 += c4
+ h4 -= c4 << 26
/* |h0| <= 2^25 */
/* |h4| <= 2^25 */
/* |h1| <= 1.51*2^58 */
/* |h5| <= 1.51*2^58 */
- carry[1] = (h1 + (1 << 24)) >> 25
- h2 += carry[1]
- h1 -= carry[1] << 25
- carry[5] = (h5 + (1 << 24)) >> 25
- h6 += carry[5]
- h5 -= carry[5] << 25
+ c1 = (h1 + (1 << 24)) >> 25
+ h2 += c1
+ h1 -= c1 << 25
+ c5 = (h5 + (1 << 24)) >> 25
+ h6 += c5
+ h5 -= c5 << 25
/* |h1| <= 2^24; from now on fits into int32 */
/* |h5| <= 2^24; from now on fits into int32 */
/* |h2| <= 1.21*2^59 */
/* |h6| <= 1.21*2^59 */
- carry[2] = (h2 + (1 << 25)) >> 26
- h3 += carry[2]
- h2 -= carry[2] << 26
- carry[6] = (h6 + (1 << 25)) >> 26
- h7 += carry[6]
- h6 -= carry[6] << 26
+ c2 = (h2 + (1 << 25)) >> 26
+ h3 += c2
+ h2 -= c2 << 26
+ c6 = (h6 + (1 << 25)) >> 26
+ h7 += c6
+ h6 -= c6 << 26
/* |h2| <= 2^25; from now on fits into int32 unchanged */
/* |h6| <= 2^25; from now on fits into int32 unchanged */
/* |h3| <= 1.51*2^58 */
/* |h7| <= 1.51*2^58 */
- carry[3] = (h3 + (1 << 24)) >> 25
- h4 += carry[3]
- h3 -= carry[3] << 25
- carry[7] = (h7 + (1 << 24)) >> 25
- h8 += carry[7]
- h7 -= carry[7] << 25
+ c3 = (h3 + (1 << 24)) >> 25
+ h4 += c3
+ h3 -= c3 << 25
+ c7 = (h7 + (1 << 24)) >> 25
+ h8 += c7
+ h7 -= c7 << 25
/* |h3| <= 2^24; from now on fits into int32 unchanged */
/* |h7| <= 2^24; from now on fits into int32 unchanged */
/* |h4| <= 1.52*2^33 */
/* |h8| <= 1.52*2^33 */
- carry[4] = (h4 + (1 << 25)) >> 26
- h5 += carry[4]
- h4 -= carry[4] << 26
- carry[8] = (h8 + (1 << 25)) >> 26
- h9 += carry[8]
- h8 -= carry[8] << 26
+ c4 = (h4 + (1 << 25)) >> 26
+ h5 += c4
+ h4 -= c4 << 26
+ c8 = (h8 + (1 << 25)) >> 26
+ h9 += c8
+ h8 -= c8 << 26
/* |h4| <= 2^25; from now on fits into int32 unchanged */
/* |h8| <= 2^25; from now on fits into int32 unchanged */
/* |h5| <= 1.01*2^24 */
/* |h9| <= 1.51*2^58 */
- carry[9] = (h9 + (1 << 24)) >> 25
- h0 += carry[9] * 19
- h9 -= carry[9] << 25
+ c9 = (h9 + (1 << 24)) >> 25
+ h0 += c9 * 19
+ h9 -= c9 << 25
/* |h9| <= 2^24; from now on fits into int32 unchanged */
/* |h0| <= 1.8*2^37 */
- carry[0] = (h0 + (1 << 25)) >> 26
- h1 += carry[0]
- h0 -= carry[0] << 26
+ c0 = (h0 + (1 << 25)) >> 26
+ h1 += c0
+ h0 -= c0 << 26
/* |h0| <= 2^25; from now on fits into int32 unchanged */
/* |h1| <= 1.01*2^24 */
@@ -567,157 +347,255 @@ func FeMul(h, f, g *FieldElement) {
h[9] = int32(h9)
}
-// FeSquare calculates h = f*f. Can overlap h with f.
+// FeMul calculates h = f * g
+// Can overlap h with f or g.
//
// Preconditions:
// |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
+// |g| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
//
// Postconditions:
// |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
-func FeSquare(h, f *FieldElement) {
- f0 := f[0]
- f1 := f[1]
- f2 := f[2]
- f3 := f[3]
- f4 := f[4]
- f5 := f[5]
- f6 := f[6]
- f7 := f[7]
- f8 := f[8]
- f9 := f[9]
- f0_2 := 2 * f0
- f1_2 := 2 * f1
- f2_2 := 2 * f2
- f3_2 := 2 * f3
- f4_2 := 2 * f4
- f5_2 := 2 * f5
- f6_2 := 2 * f6
- f7_2 := 2 * f7
- f5_38 := 38 * f5 // 1.31*2^30
- f6_19 := 19 * f6 // 1.31*2^30
- f7_38 := 38 * f7 // 1.31*2^30
- f8_19 := 19 * f8 // 1.31*2^30
- f9_38 := 38 * f9 // 1.31*2^30
- f0f0 := int64(f0) * int64(f0)
- f0f1_2 := int64(f0_2) * int64(f1)
- f0f2_2 := int64(f0_2) * int64(f2)
- f0f3_2 := int64(f0_2) * int64(f3)
- f0f4_2 := int64(f0_2) * int64(f4)
- f0f5_2 := int64(f0_2) * int64(f5)
- f0f6_2 := int64(f0_2) * int64(f6)
- f0f7_2 := int64(f0_2) * int64(f7)
- f0f8_2 := int64(f0_2) * int64(f8)
- f0f9_2 := int64(f0_2) * int64(f9)
- f1f1_2 := int64(f1_2) * int64(f1)
- f1f2_2 := int64(f1_2) * int64(f2)
+//
+// Notes on implementation strategy:
+//
+// Using schoolbook multiplication.
+// Karatsuba would save a little in some cost models.
+//
+// Most multiplications by 2 and 19 are 32-bit precomputations;
+// cheaper than 64-bit postcomputations.
+//
+// There is one remaining multiplication by 19 in the carry chain;
+// one *19 precomputation can be merged into this,
+// but the resulting data flow is considerably less clean.
+//
+// There are 12 carries below.
+// 10 of them are 2-way parallelizable and vectorizable.
+// Can get away with 11 carries, but then data flow is much deeper.
+//
+// With tighter constraints on inputs can squeeze carries into int32.
+func FeMul(h, f, g *FieldElement) {
+ g1_19 := 19 * g[1] /* 1.4*2^29 */
+ g2_19 := 19 * g[2] /* 1.4*2^30; still ok */
+ g3_19 := 19 * g[3]
+ g4_19 := 19 * g[4]
+ g5_19 := 19 * g[5]
+ g6_19 := 19 * g[6]
+ g7_19 := 19 * g[7]
+ g8_19 := 19 * g[8]
+ g9_19 := 19 * g[9]
+ f1_2 := 2 * f[1]
+ f3_2 := 2 * f[3]
+ f5_2 := 2 * f[5]
+ f7_2 := 2 * f[7]
+ f9_2 := 2 * f[9]
+ f0g0 := int64(f[0]) * int64(g[0])
+ f0g1 := int64(f[0]) * int64(g[1])
+ f0g2 := int64(f[0]) * int64(g[2])
+ f0g3 := int64(f[0]) * int64(g[3])
+ f0g4 := int64(f[0]) * int64(g[4])
+ f0g5 := int64(f[0]) * int64(g[5])
+ f0g6 := int64(f[0]) * int64(g[6])
+ f0g7 := int64(f[0]) * int64(g[7])
+ f0g8 := int64(f[0]) * int64(g[8])
+ f0g9 := int64(f[0]) * int64(g[9])
+ f1g0 := int64(f[1]) * int64(g[0])
+ f1g1_2 := int64(f1_2) * int64(g[1])
+ f1g2 := int64(f[1]) * int64(g[2])
+ f1g3_2 := int64(f1_2) * int64(g[3])
+ f1g4 := int64(f[1]) * int64(g[4])
+ f1g5_2 := int64(f1_2) * int64(g[5])
+ f1g6 := int64(f[1]) * int64(g[6])
+ f1g7_2 := int64(f1_2) * int64(g[7])
+ f1g8 := int64(f[1]) * int64(g[8])
+ f1g9_38 := int64(f1_2) * int64(g9_19)
+ f2g0 := int64(f[2]) * int64(g[0])
+ f2g1 := int64(f[2]) * int64(g[1])
+ f2g2 := int64(f[2]) * int64(g[2])
+ f2g3 := int64(f[2]) * int64(g[3])
+ f2g4 := int64(f[2]) * int64(g[4])
+ f2g5 := int64(f[2]) * int64(g[5])
+ f2g6 := int64(f[2]) * int64(g[6])
+ f2g7 := int64(f[2]) * int64(g[7])
+ f2g8_19 := int64(f[2]) * int64(g8_19)
+ f2g9_19 := int64(f[2]) * int64(g9_19)
+ f3g0 := int64(f[3]) * int64(g[0])
+ f3g1_2 := int64(f3_2) * int64(g[1])
+ f3g2 := int64(f[3]) * int64(g[2])
+ f3g3_2 := int64(f3_2) * int64(g[3])
+ f3g4 := int64(f[3]) * int64(g[4])
+ f3g5_2 := int64(f3_2) * int64(g[5])
+ f3g6 := int64(f[3]) * int64(g[6])
+ f3g7_38 := int64(f3_2) * int64(g7_19)
+ f3g8_19 := int64(f[3]) * int64(g8_19)
+ f3g9_38 := int64(f3_2) * int64(g9_19)
+ f4g0 := int64(f[4]) * int64(g[0])
+ f4g1 := int64(f[4]) * int64(g[1])
+ f4g2 := int64(f[4]) * int64(g[2])
+ f4g3 := int64(f[4]) * int64(g[3])
+ f4g4 := int64(f[4]) * int64(g[4])
+ f4g5 := int64(f[4]) * int64(g[5])
+ f4g6_19 := int64(f[4]) * int64(g6_19)
+ f4g7_19 := int64(f[4]) * int64(g7_19)
+ f4g8_19 := int64(f[4]) * int64(g8_19)
+ f4g9_19 := int64(f[4]) * int64(g9_19)
+ f5g0 := int64(f[5]) * int64(g[0])
+ f5g1_2 := int64(f5_2) * int64(g[1])
+ f5g2 := int64(f[5]) * int64(g[2])
+ f5g3_2 := int64(f5_2) * int64(g[3])
+ f5g4 := int64(f[5]) * int64(g[4])
+ f5g5_38 := int64(f5_2) * int64(g5_19)
+ f5g6_19 := int64(f[5]) * int64(g6_19)
+ f5g7_38 := int64(f5_2) * int64(g7_19)
+ f5g8_19 := int64(f[5]) * int64(g8_19)
+ f5g9_38 := int64(f5_2) * int64(g9_19)
+ f6g0 := int64(f[6]) * int64(g[0])
+ f6g1 := int64(f[6]) * int64(g[1])
+ f6g2 := int64(f[6]) * int64(g[2])
+ f6g3 := int64(f[6]) * int64(g[3])
+ f6g4_19 := int64(f[6]) * int64(g4_19)
+ f6g5_19 := int64(f[6]) * int64(g5_19)
+ f6g6_19 := int64(f[6]) * int64(g6_19)
+ f6g7_19 := int64(f[6]) * int64(g7_19)
+ f6g8_19 := int64(f[6]) * int64(g8_19)
+ f6g9_19 := int64(f[6]) * int64(g9_19)
+ f7g0 := int64(f[7]) * int64(g[0])
+ f7g1_2 := int64(f7_2) * int64(g[1])
+ f7g2 := int64(f[7]) * int64(g[2])
+ f7g3_38 := int64(f7_2) * int64(g3_19)
+ f7g4_19 := int64(f[7]) * int64(g4_19)
+ f7g5_38 := int64(f7_2) * int64(g5_19)
+ f7g6_19 := int64(f[7]) * int64(g6_19)
+ f7g7_38 := int64(f7_2) * int64(g7_19)
+ f7g8_19 := int64(f[7]) * int64(g8_19)
+ f7g9_38 := int64(f7_2) * int64(g9_19)
+ f8g0 := int64(f[8]) * int64(g[0])
+ f8g1 := int64(f[8]) * int64(g[1])
+ f8g2_19 := int64(f[8]) * int64(g2_19)
+ f8g3_19 := int64(f[8]) * int64(g3_19)
+ f8g4_19 := int64(f[8]) * int64(g4_19)
+ f8g5_19 := int64(f[8]) * int64(g5_19)
+ f8g6_19 := int64(f[8]) * int64(g6_19)
+ f8g7_19 := int64(f[8]) * int64(g7_19)
+ f8g8_19 := int64(f[8]) * int64(g8_19)
+ f8g9_19 := int64(f[8]) * int64(g9_19)
+ f9g0 := int64(f[9]) * int64(g[0])
+ f9g1_38 := int64(f9_2) * int64(g1_19)
+ f9g2_19 := int64(f[9]) * int64(g2_19)
+ f9g3_38 := int64(f9_2) * int64(g3_19)
+ f9g4_19 := int64(f[9]) * int64(g4_19)
+ f9g5_38 := int64(f9_2) * int64(g5_19)
+ f9g6_19 := int64(f[9]) * int64(g6_19)
+ f9g7_38 := int64(f9_2) * int64(g7_19)
+ f9g8_19 := int64(f[9]) * int64(g8_19)
+ f9g9_38 := int64(f9_2) * int64(g9_19)
+
+ h0 := f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38
+ h1 := f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19
+ h2 := f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38
+ h3 := f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19
+ h4 := f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38
+ h5 := f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19
+ h6 := f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 + f6g0 + f7g9_38 + f8g8_19 + f9g7_38
+ h7 := f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + f8g9_19 + f9g8_19
+ h8 := f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 + f8g0 + f9g9_38
+ h9 := f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0
+
+ FeCombine(h, h0, h1, h2, h3, h4, h5, h6, h7, h8, h9)
+}
+
+func feSquare(f *FieldElement) (h0, h1, h2, h3, h4, h5, h6, h7, h8, h9 int64) {
+ f0_2 := 2 * f[0]
+ f1_2 := 2 * f[1]
+ f2_2 := 2 * f[2]
+ f3_2 := 2 * f[3]
+ f4_2 := 2 * f[4]
+ f5_2 := 2 * f[5]
+ f6_2 := 2 * f[6]
+ f7_2 := 2 * f[7]
+ f5_38 := 38 * f[5] // 1.31*2^30
+ f6_19 := 19 * f[6] // 1.31*2^30
+ f7_38 := 38 * f[7] // 1.31*2^30
+ f8_19 := 19 * f[8] // 1.31*2^30
+ f9_38 := 38 * f[9] // 1.31*2^30
+ f0f0 := int64(f[0]) * int64(f[0])
+ f0f1_2 := int64(f0_2) * int64(f[1])
+ f0f2_2 := int64(f0_2) * int64(f[2])
+ f0f3_2 := int64(f0_2) * int64(f[3])
+ f0f4_2 := int64(f0_2) * int64(f[4])
+ f0f5_2 := int64(f0_2) * int64(f[5])
+ f0f6_2 := int64(f0_2) * int64(f[6])
+ f0f7_2 := int64(f0_2) * int64(f[7])
+ f0f8_2 := int64(f0_2) * int64(f[8])
+ f0f9_2 := int64(f0_2) * int64(f[9])
+ f1f1_2 := int64(f1_2) * int64(f[1])
+ f1f2_2 := int64(f1_2) * int64(f[2])
f1f3_4 := int64(f1_2) * int64(f3_2)
- f1f4_2 := int64(f1_2) * int64(f4)
+ f1f4_2 := int64(f1_2) * int64(f[4])
f1f5_4 := int64(f1_2) * int64(f5_2)
- f1f6_2 := int64(f1_2) * int64(f6)
+ f1f6_2 := int64(f1_2) * int64(f[6])
f1f7_4 := int64(f1_2) * int64(f7_2)
- f1f8_2 := int64(f1_2) * int64(f8)
+ f1f8_2 := int64(f1_2) * int64(f[8])
f1f9_76 := int64(f1_2) * int64(f9_38)
- f2f2 := int64(f2) * int64(f2)
- f2f3_2 := int64(f2_2) * int64(f3)
- f2f4_2 := int64(f2_2) * int64(f4)
- f2f5_2 := int64(f2_2) * int64(f5)
- f2f6_2 := int64(f2_2) * int64(f6)
- f2f7_2 := int64(f2_2) * int64(f7)
+ f2f2 := int64(f[2]) * int64(f[2])
+ f2f3_2 := int64(f2_2) * int64(f[3])
+ f2f4_2 := int64(f2_2) * int64(f[4])
+ f2f5_2 := int64(f2_2) * int64(f[5])
+ f2f6_2 := int64(f2_2) * int64(f[6])
+ f2f7_2 := int64(f2_2) * int64(f[7])
f2f8_38 := int64(f2_2) * int64(f8_19)
- f2f9_38 := int64(f2) * int64(f9_38)
- f3f3_2 := int64(f3_2) * int64(f3)
- f3f4_2 := int64(f3_2) * int64(f4)
+ f2f9_38 := int64(f[2]) * int64(f9_38)
+ f3f3_2 := int64(f3_2) * int64(f[3])
+ f3f4_2 := int64(f3_2) * int64(f[4])
f3f5_4 := int64(f3_2) * int64(f5_2)
- f3f6_2 := int64(f3_2) * int64(f6)
+ f3f6_2 := int64(f3_2) * int64(f[6])
f3f7_76 := int64(f3_2) * int64(f7_38)
f3f8_38 := int64(f3_2) * int64(f8_19)
f3f9_76 := int64(f3_2) * int64(f9_38)
- f4f4 := int64(f4) * int64(f4)
- f4f5_2 := int64(f4_2) * int64(f5)
+ f4f4 := int64(f[4]) * int64(f[4])
+ f4f5_2 := int64(f4_2) * int64(f[5])
f4f6_38 := int64(f4_2) * int64(f6_19)
- f4f7_38 := int64(f4) * int64(f7_38)
+ f4f7_38 := int64(f[4]) * int64(f7_38)
f4f8_38 := int64(f4_2) * int64(f8_19)
- f4f9_38 := int64(f4) * int64(f9_38)
- f5f5_38 := int64(f5) * int64(f5_38)
+ f4f9_38 := int64(f[4]) * int64(f9_38)
+ f5f5_38 := int64(f[5]) * int64(f5_38)
f5f6_38 := int64(f5_2) * int64(f6_19)
f5f7_76 := int64(f5_2) * int64(f7_38)
f5f8_38 := int64(f5_2) * int64(f8_19)
f5f9_76 := int64(f5_2) * int64(f9_38)
- f6f6_19 := int64(f6) * int64(f6_19)
- f6f7_38 := int64(f6) * int64(f7_38)
+ f6f6_19 := int64(f[6]) * int64(f6_19)
+ f6f7_38 := int64(f[6]) * int64(f7_38)
f6f8_38 := int64(f6_2) * int64(f8_19)
- f6f9_38 := int64(f6) * int64(f9_38)
- f7f7_38 := int64(f7) * int64(f7_38)
+ f6f9_38 := int64(f[6]) * int64(f9_38)
+ f7f7_38 := int64(f[7]) * int64(f7_38)
f7f8_38 := int64(f7_2) * int64(f8_19)
f7f9_76 := int64(f7_2) * int64(f9_38)
- f8f8_19 := int64(f8) * int64(f8_19)
- f8f9_38 := int64(f8) * int64(f9_38)
- f9f9_38 := int64(f9) * int64(f9_38)
- h0 := f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38
- h1 := f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38
- h2 := f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19
- h3 := f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38
- h4 := f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38
- h5 := f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38
- h6 := f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19
- h7 := f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38
- h8 := f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38
- h9 := f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2
- var carry [10]int64
-
- carry[0] = (h0 + (1 << 25)) >> 26
- h1 += carry[0]
- h0 -= carry[0] << 26
- carry[4] = (h4 + (1 << 25)) >> 26
- h5 += carry[4]
- h4 -= carry[4] << 26
-
- carry[1] = (h1 + (1 << 24)) >> 25
- h2 += carry[1]
- h1 -= carry[1] << 25
- carry[5] = (h5 + (1 << 24)) >> 25
- h6 += carry[5]
- h5 -= carry[5] << 25
-
- carry[2] = (h2 + (1 << 25)) >> 26
- h3 += carry[2]
- h2 -= carry[2] << 26
- carry[6] = (h6 + (1 << 25)) >> 26
- h7 += carry[6]
- h6 -= carry[6] << 26
-
- carry[3] = (h3 + (1 << 24)) >> 25
- h4 += carry[3]
- h3 -= carry[3] << 25
- carry[7] = (h7 + (1 << 24)) >> 25
- h8 += carry[7]
- h7 -= carry[7] << 25
-
- carry[4] = (h4 + (1 << 25)) >> 26
- h5 += carry[4]
- h4 -= carry[4] << 26
- carry[8] = (h8 + (1 << 25)) >> 26
- h9 += carry[8]
- h8 -= carry[8] << 26
-
- carry[9] = (h9 + (1 << 24)) >> 25
- h0 += carry[9] * 19
- h9 -= carry[9] << 25
-
- carry[0] = (h0 + (1 << 25)) >> 26
- h1 += carry[0]
- h0 -= carry[0] << 26
+ f8f8_19 := int64(f[8]) * int64(f8_19)
+ f8f9_38 := int64(f[8]) * int64(f9_38)
+ f9f9_38 := int64(f[9]) * int64(f9_38)
+ h0 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38
+ h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38
+ h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19
+ h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38
+ h4 = f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38
+ h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38
+ h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19
+ h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38
+ h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38
+ h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2
+ return
+}
- h[0] = int32(h0)
- h[1] = int32(h1)
- h[2] = int32(h2)
- h[3] = int32(h3)
- h[4] = int32(h4)
- h[5] = int32(h5)
- h[6] = int32(h6)
- h[7] = int32(h7)
- h[8] = int32(h8)
- h[9] = int32(h9)
+// FeSquare calculates h = f*f. Can overlap h with f.
+//
+// Preconditions:
+// |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
+//
+// Postconditions:
+// |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
+func FeSquare(h, f *FieldElement) {
+ h0, h1, h2, h3, h4, h5, h6, h7, h8, h9 := feSquare(f)
+ FeCombine(h, h0, h1, h2, h3, h4, h5, h6, h7, h8, h9)
}
// FeSquare2 sets h = 2 * f * f
@@ -731,95 +609,7 @@ func FeSquare(h, f *FieldElement) {
// |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
// See fe_mul.c for discussion of implementation strategy.
func FeSquare2(h, f *FieldElement) {
- f0 := f[0]
- f1 := f[1]
- f2 := f[2]
- f3 := f[3]
- f4 := f[4]
- f5 := f[5]
- f6 := f[6]
- f7 := f[7]
- f8 := f[8]
- f9 := f[9]
- f0_2 := 2 * f0
- f1_2 := 2 * f1
- f2_2 := 2 * f2
- f3_2 := 2 * f3
- f4_2 := 2 * f4
- f5_2 := 2 * f5
- f6_2 := 2 * f6
- f7_2 := 2 * f7
- f5_38 := 38 * f5 // 1.959375*2^30
- f6_19 := 19 * f6 // 1.959375*2^30
- f7_38 := 38 * f7 // 1.959375*2^30
- f8_19 := 19 * f8 // 1.959375*2^30
- f9_38 := 38 * f9 // 1.959375*2^30
- f0f0 := int64(f0) * int64(f0)
- f0f1_2 := int64(f0_2) * int64(f1)
- f0f2_2 := int64(f0_2) * int64(f2)
- f0f3_2 := int64(f0_2) * int64(f3)
- f0f4_2 := int64(f0_2) * int64(f4)
- f0f5_2 := int64(f0_2) * int64(f5)
- f0f6_2 := int64(f0_2) * int64(f6)
- f0f7_2 := int64(f0_2) * int64(f7)
- f0f8_2 := int64(f0_2) * int64(f8)
- f0f9_2 := int64(f0_2) * int64(f9)
- f1f1_2 := int64(f1_2) * int64(f1)
- f1f2_2 := int64(f1_2) * int64(f2)
- f1f3_4 := int64(f1_2) * int64(f3_2)
- f1f4_2 := int64(f1_2) * int64(f4)
- f1f5_4 := int64(f1_2) * int64(f5_2)
- f1f6_2 := int64(f1_2) * int64(f6)
- f1f7_4 := int64(f1_2) * int64(f7_2)
- f1f8_2 := int64(f1_2) * int64(f8)
- f1f9_76 := int64(f1_2) * int64(f9_38)
- f2f2 := int64(f2) * int64(f2)
- f2f3_2 := int64(f2_2) * int64(f3)
- f2f4_2 := int64(f2_2) * int64(f4)
- f2f5_2 := int64(f2_2) * int64(f5)
- f2f6_2 := int64(f2_2) * int64(f6)
- f2f7_2 := int64(f2_2) * int64(f7)
- f2f8_38 := int64(f2_2) * int64(f8_19)
- f2f9_38 := int64(f2) * int64(f9_38)
- f3f3_2 := int64(f3_2) * int64(f3)
- f3f4_2 := int64(f3_2) * int64(f4)
- f3f5_4 := int64(f3_2) * int64(f5_2)
- f3f6_2 := int64(f3_2) * int64(f6)
- f3f7_76 := int64(f3_2) * int64(f7_38)
- f3f8_38 := int64(f3_2) * int64(f8_19)
- f3f9_76 := int64(f3_2) * int64(f9_38)
- f4f4 := int64(f4) * int64(f4)
- f4f5_2 := int64(f4_2) * int64(f5)
- f4f6_38 := int64(f4_2) * int64(f6_19)
- f4f7_38 := int64(f4) * int64(f7_38)
- f4f8_38 := int64(f4_2) * int64(f8_19)
- f4f9_38 := int64(f4) * int64(f9_38)
- f5f5_38 := int64(f5) * int64(f5_38)
- f5f6_38 := int64(f5_2) * int64(f6_19)
- f5f7_76 := int64(f5_2) * int64(f7_38)
- f5f8_38 := int64(f5_2) * int64(f8_19)
- f5f9_76 := int64(f5_2) * int64(f9_38)
- f6f6_19 := int64(f6) * int64(f6_19)
- f6f7_38 := int64(f6) * int64(f7_38)
- f6f8_38 := int64(f6_2) * int64(f8_19)
- f6f9_38 := int64(f6) * int64(f9_38)
- f7f7_38 := int64(f7) * int64(f7_38)
- f7f8_38 := int64(f7_2) * int64(f8_19)
- f7f9_76 := int64(f7_2) * int64(f9_38)
- f8f8_19 := int64(f8) * int64(f8_19)
- f8f9_38 := int64(f8) * int64(f9_38)
- f9f9_38 := int64(f9) * int64(f9_38)
- h0 := f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38
- h1 := f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38
- h2 := f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19
- h3 := f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38
- h4 := f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38
- h5 := f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38
- h6 := f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19
- h7 := f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38
- h8 := f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38
- h9 := f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2
- var carry [10]int64
+ h0, h1, h2, h3, h4, h5, h6, h7, h8, h9 := feSquare(f)
h0 += h0
h1 += h1
@@ -832,59 +622,7 @@ func FeSquare2(h, f *FieldElement) {
h8 += h8
h9 += h9
- carry[0] = (h0 + (1 << 25)) >> 26
- h1 += carry[0]
- h0 -= carry[0] << 26
- carry[4] = (h4 + (1 << 25)) >> 26
- h5 += carry[4]
- h4 -= carry[4] << 26
-
- carry[1] = (h1 + (1 << 24)) >> 25
- h2 += carry[1]
- h1 -= carry[1] << 25
- carry[5] = (h5 + (1 << 24)) >> 25
- h6 += carry[5]
- h5 -= carry[5] << 25
-
- carry[2] = (h2 + (1 << 25)) >> 26
- h3 += carry[2]
- h2 -= carry[2] << 26
- carry[6] = (h6 + (1 << 25)) >> 26
- h7 += carry[6]
- h6 -= carry[6] << 26
-
- carry[3] = (h3 + (1 << 24)) >> 25
- h4 += carry[3]
- h3 -= carry[3] << 25
- carry[7] = (h7 + (1 << 24)) >> 25
- h8 += carry[7]
- h7 -= carry[7] << 25
-
- carry[4] = (h4 + (1 << 25)) >> 26
- h5 += carry[4]
- h4 -= carry[4] << 26
- carry[8] = (h8 + (1 << 25)) >> 26
- h9 += carry[8]
- h8 -= carry[8] << 26
-
- carry[9] = (h9 + (1 << 24)) >> 25
- h0 += carry[9] * 19
- h9 -= carry[9] << 25
-
- carry[0] = (h0 + (1 << 25)) >> 26
- h1 += carry[0]
- h0 -= carry[0] << 26
-
- h[0] = int32(h0)
- h[1] = int32(h1)
- h[2] = int32(h2)
- h[3] = int32(h3)
- h[4] = int32(h4)
- h[5] = int32(h5)
- h[6] = int32(h6)
- h[7] = int32(h7)
- h[8] = int32(h8)
- h[9] = int32(h9)
+ FeCombine(h, h0, h1, h2, h3, h4, h5, h6, h7, h8, h9)
}
func FeInvert(out, z *FieldElement) {