Issue #2895 - Implement 32-bit compatible Xoroshiro128++

This adds a reduced-register version of Xoroshiro128++ that /just/ fits in the 7 registers available to us on 32-bit x86. It does so by adding an extra state slot, mState[2], in which the JIT code temporarily stores the result until it is passed back. Resolves #2895
2026-05-26 13:58:49 +00:00 · 2026-01-27 11:43:43 +01:00
parent e5497c8425
commit 0dbad452e6
10 changed files with 68 additions and 83 deletions
@@ -12059,26 +12059,23 @@ CodeGenerator::visitRandom(LRandom* ins)
    Register64 s0Reg(ToRegister(ins->temp1()));
    Register64 s1Reg(ToRegister(ins->temp2()));
    // Helper registers for intermediate and final results
-    Register64 imr1Reg(ToRegister(ins->temp3()));
-    Register64 imr2Reg(ToRegister(ins->temp4()));
-    Register64 resultReg(ToRegister(ins->temp5()));
+    Register64 imrReg(ToRegister(ins->temp3()));
 #else
    Register64 s0Reg(ToRegister(ins->temp1()), ToRegister(ins->temp2()));
    Register64 s1Reg(ToRegister(ins->temp3()), ToRegister(ins->temp4()));
    // Helper registers for intermediate and final results
-    Register64 imr1Reg(ToRegister(ins->temp5()), ToRegister(ins->temp6()));
-    Register64 imr2Reg(ToRegister(ins->temp7()), ToRegister(ins->temp8()));
-    Register64 resultReg(ToRegister(ins->temp9()), ToRegister(ins->temp10()));
+    Register64 imrReg(ToRegister(ins->temp5()), ToRegister(ins->temp6()));
 #endif

    const void* rng = gen->compartment->addressOfRandomNumberGenerator();
    masm.movePtr(ImmPtr(rng), tempReg);

-    static_assert(sizeof(Xoroshiro128PlusPlusRNG) == 2 * sizeof(uint64_t),
-                  "Code below assumes Xoroshiro128PlusPlusRNG contains two uint64_t values");
+    static_assert(sizeof(Xoroshiro128PlusPlusRNG) == 3 * sizeof(uint64_t),
+                  "Code below assumes Xoroshiro128PlusPlusRNG contains three uint64_t values");

    Address state0Addr(tempReg, Xoroshiro128PlusPlusRNG::offsetOfState0());
    Address state1Addr(tempReg, Xoroshiro128PlusPlusRNG::offsetOfState1());
+    Address state2Addr(tempReg, Xoroshiro128PlusPlusRNG::offsetOfState2());

    // const uint64_t s0 = mState[0];
    masm.load64(state0Addr, s0Reg);
@@ -12086,49 +12083,61 @@ CodeGenerator::visitRandom(LRandom* ins)
    masm.load64(state1Addr, s1Reg);
    
    // const uint64_t result = rotl(s0 + s1, 17) + s0;
-    masm.move64(s0Reg, imr1Reg);
-    masm.add64(s1Reg, imr1Reg);
+    masm.move64(s0Reg, imrReg);
+    masm.add64(s1Reg, imrReg);
 #ifdef JS_PUNBOX64
-    masm.rotateLeft64(Imm32(17), imr1Reg, resultReg);
+    masm.rotateLeft64(Imm32(17), imrReg, imrReg);
 #else
-    masm.rotateLeft64(Imm32(17), imr1Reg, resultReg, tempReg);
+    masm.Push(tempReg);
+    masm.rotateLeft64(Imm32(17), imrReg, imrReg, tempReg);
+    masm.Pop(tempReg);
 #endif
-    masm.add64(s0Reg, resultReg);
+    masm.add64(s0Reg, imrReg);
+    
+    // Store the result in mState[2], freeing up the intermediate register again.
+    masm.store64(imrReg, state2Addr);
    
    // s1 ^= s0;
    masm.xor64(s0Reg, s1Reg);
    
-    // mState[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21); // a, b
-#ifdef JS_PUNBOX64
-    masm.rotateLeft64(Imm32(49), s0Reg, imr1Reg);   // imr = s0 rotl 49
-#else
-    masm.rotateLeft64(Imm32(49), s0Reg, imr1Reg, tempReg);   // imr = s0 rotl 49
-#endif
-    masm.xor64(s1Reg, imr1Reg);                     // imr ^ s1
-    masm.move64(s1Reg, imr2Reg);                    // imr2 = s1
-    masm.lshift64(Imm32(21), imr2Reg);              // imr2 << 21
-    masm.xor64(imr2Reg, imr1Reg);                   // imr ^ imr2
-    masm.store64(imr1Reg, state0Addr);
-
    // mState[1] = rotl(s1, 28); // c
 #ifdef JS_PUNBOX64
-    masm.rotateLeft64(Imm32(28), s1Reg, imr1Reg);
+    masm.rotateLeft64(Imm32(28), s1Reg, imrReg);
 #else
-    masm.rotateLeft64(Imm32(28), s1Reg, imr1Reg, tempReg);
+    masm.Push(tempReg);
+    masm.rotateLeft64(Imm32(28), s1Reg, imrReg, tempReg);
+    masm.Pop(tempReg);
 #endif
-    masm.store64(imr1Reg, state1Addr);
+    masm.store64(imrReg, state1Addr);
+
+    // mState[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21); // a, b
+#ifdef JS_PUNBOX64
+    masm.rotateLeft64(Imm32(49), s0Reg, s0Reg);    // s0 rotl 49
+#else
+    masm.Push(tempReg);
+    masm.rotateLeft64(Imm32(49), s0Reg, s0Reg, tempReg);   // s0 rotl 49
+    masm.Pop(tempReg);
+#endif
+    masm.move64(s1Reg, imrReg);                    // imr = s1
+    masm.lshift64(Imm32(21), imrReg);              // imr << 21
+    masm.xor64(imrReg, s0Reg);                     // s0 ^= imr
+    masm.xor64(s1Reg, s0Reg);                      // s0 ^= s1
+    masm.store64(s0Reg, state0Addr);
+
+    // Recall the result from mState[2]
+    masm.load64(state2Addr, s1Reg);

    // See comment in Xoroshiro128PlusPlusRNG::nextDouble().
    static const int MantissaBits = FloatingPoint<double>::kExponentShift + 1;
    static const double ScaleInv = double(1) / (1ULL << MantissaBits);

    // Mask the result bits to mantissa size
-    masm.and64(Imm64((1ULL << MantissaBits) - 1), resultReg);
+    masm.and64(Imm64((1ULL << MantissaBits) - 1), s1Reg);

    if (masm.convertUInt64ToDoubleNeedsTemp())
-        masm.convertUInt64ToDouble(resultReg, output, tempReg);
+        masm.convertUInt64ToDouble(s1Reg, output, tempReg);
    else
-        masm.convertUInt64ToDouble(resultReg, output, Register::Invalid());
+        masm.convertUInt64ToDouble(s1Reg, output, Register::Invalid());

    // output *= ScaleInv
    masm.mulDoublePtr(ImmPtr(&ScaleInv), tempReg, output);
@@ -968,10 +968,6 @@ void
 LIRGeneratorARM::visitRandom(MRandom* ins)
 {
    LRandom *lir = new(alloc()) LRandom(temp(),
-                                        temp(),
-                                        temp(),
-                                        temp(),
-                                        temp(),
                                        temp(),
                                        temp(),
                                        temp(),
@@ -326,8 +326,6 @@ void
 LIRGeneratorARM64::visitRandom(MRandom* ins)
 {
    LRandom *lir = new(alloc()) LRandom(temp(),
-                                        temp(),
-                                        temp(),
                                        temp(),
                                        temp(),
                                        temp());
@@ -249,10 +249,6 @@ void
 LIRGeneratorMIPS::visitRandom(MRandom* ins)
 {
    LRandom *lir = new(alloc()) LRandom(temp(),
-                                        temp(),
-                                        temp(),
-                                        temp(),
-                                        temp(),
                                        temp(),
                                        temp(),
                                        temp(),
@@ -179,8 +179,6 @@ void
 LIRGeneratorMIPS64::visitRandom(MRandom* ins)
 {
    LRandom *lir = new(alloc()) LRandom(temp(), 
-                                        temp(),
-                                        temp(),
                                        temp(),
                                        temp(),
                                        temp());
@@ -8220,9 +8220,9 @@ class LArrowNewTarget : public LInstructionHelper<BOX_PIECES, 1, 0>

 // Math.random().
 #ifdef JS_PUNBOX64
-# define LRANDOM_NUM_TEMPS 6
+# define LRANDOM_NUM_TEMPS 4
 #else
-# define LRANDOM_NUM_TEMPS 11
+# define LRANDOM_NUM_TEMPS 7
 #endif

 class LRandom : public LInstructionHelper<1, 0, LRANDOM_NUM_TEMPS>
@@ -8230,12 +8230,10 @@ class LRandom : public LInstructionHelper<1, 0, LRANDOM_NUM_TEMPS>
  public:
    LIR_HEADER(Random)
    LRandom(const LDefinition &temp0, const LDefinition &temp1,
-            const LDefinition &temp2, const LDefinition &temp3,
-            const LDefinition &temp4, const LDefinition &temp5
+            const LDefinition &temp2, const LDefinition &temp3
 #ifndef JS_PUNBOX64
-            , const LDefinition &temp6, const LDefinition &temp7
-            , const LDefinition &temp8, const LDefinition &temp9
-            , const LDefinition &temp10
+            , const LDefinition &temp4, const LDefinition &temp5
+            , const LDefinition &temp6
 #endif
            )
    {
@@ -8243,14 +8241,10 @@ class LRandom : public LInstructionHelper<1, 0, LRANDOM_NUM_TEMPS>
        setTemp(1, temp1);
        setTemp(2, temp2);
        setTemp(3, temp3);
+#ifndef JS_PUNBOX64
        setTemp(4, temp4);
        setTemp(5, temp5);
-#ifndef JS_PUNBOX64
        setTemp(6, temp6);
-        setTemp(7, temp7);
-        setTemp(8, temp8);
-        setTemp(9, temp9);
-        setTemp(10, temp10);
 #endif
    }
    const LDefinition* temp0() {
@@ -8265,28 +8259,16 @@ class LRandom : public LInstructionHelper<1, 0, LRANDOM_NUM_TEMPS>
    const LDefinition* temp3() {
        return getTemp(3);
    }
+#ifndef JS_PUNBOX64
    const LDefinition* temp4() {
        return getTemp(4);
    }
    const LDefinition* temp5() {
        return getTemp(5);
    }
-#ifndef JS_PUNBOX64
    const LDefinition* temp6() {
        return getTemp(6);
    }
-    const LDefinition* temp7() {
-        return getTemp(7);
-    }
-    const LDefinition* temp8() {
-        return getTemp(8);
-    }
-    const LDefinition* temp9() {
-        return getTemp(9);
-    }
-    const LDefinition* temp10() {
-        return getTemp(10);
-    }
 #endif

    MRandom* mir() const {
@@ -412,8 +412,6 @@ void
 LIRGeneratorX64::visitRandom(MRandom* ins)
 {
    LRandom *lir = new(alloc()) LRandom(temp(),
-                                        temp(),
-                                        temp(),
                                        temp(),
                                        temp(),
                                        temp());
@@ -607,10 +607,6 @@ void
 LIRGeneratorX86::visitRandom(MRandom* ins)
 {
    LRandom *lir = new(alloc()) LRandom(temp(),
-                                        temp(),
-                                        temp(),
-                                        temp(),
-                                        temp(),
                                        temp(),
                                        temp(),
                                        temp(),
@@ -31,15 +31,17 @@ const original = () => {
  var startTime = performance.now();
  const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
  const data = imageData.data;
-  for (let i = 0; i < data.length; i += 4) {
+  for (let j = 0; j < 10; j++) { // 10x for performance measurement
+   for (let i = 0; i < data.length; i += 4) {
    data[i] = Math.random()*255; // red
    data[i + 1] = Math.random()*255; // green
    data[i + 2] = Math.random()*255; // blue
    data[i+3] = 255;
+   }
  }
  ctx.putImageData(imageData, 0, 0);
  var endTime = performance.now();
-  duration.innerHTML = "Total pixels: " + (data.length / 4) +" -- Time taken: " + (endTime - startTime) + " ms";
+  duration.innerHTML = "Total pixels: " + (10 * data.length / 4) +" -- Time taken: " + (endTime - startTime) + " ms";
 };

 const invert = () => {
@@ -60,16 +62,18 @@ const grayscale = () => {
  var startTime = performance.now();
  const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
  const data = imageData.data;
-  for (let i = 0; i < data.length; i += 4) {
+  for (let j = 0; j < 10; j++) { // 10x for performance measurement
+   for (let i = 0; i < data.length; i += 4) {
    const avg = Math.random()*255;
    data[i] = avg; // red
    data[i + 1] = avg; // green
    data[i + 2] = avg; // blue
    data[i+3] = 255;
+   }
  }
  ctx.putImageData(imageData, 0, 0);
  var endTime = performance.now();
-  duration.innerHTML = "Total pixels: " + (data.length / 4) +" -- Time taken: " + (endTime - startTime) + " ms";
+  duration.innerHTML = "Total pixels: " + (10 * data.length / 4) +" -- Time taken: " + (endTime - startTime) + " ms";
 };

 const sepia = () => {
@@ -32,12 +32,16 @@ namespace non_crypto {
 *     the same speed and use half of the space; the same comments apply.
 *     They are suitable only for low-scale parallel applications.
 *
- * The stream of numbers produced by this method repeats every 2**256 - 1 calls (i.e. never, for all practical
+ * The stream of numbers produced by this method repeats every 2^128 - 1 calls (i.e. never, for all practical
 * purposes).
 *
 */
 class Xoroshiro128PlusPlusRNG {
-  uint64_t mState[2];
+  /*
+   * mState[0] and mState[1] are as described in the Xoroshiro128++ paper.
+   * mState[2] is used for temporary storage of the result in JIT code.
+   */
+  uint64_t mState[3];

 public:
  /*
@@ -82,15 +86,15 @@ class Xoroshiro128PlusPlusRNG {

  /*
   * Return a pseudo-random floating-point value in the range [0, 1). More
-   * precisely, choose an integer in the range [0, 2**53) and divide it by
-   * 2**53. Given the 2**256 - 1 period noted above, the produced doubles are
+   * precisely, choose an integer in the range [0, 2^53) and divide it by
+   * 2^53. Given the 2^128 - 1 period noted above, the produced doubles are
   * all but uniformly distributed in this range.
   */
  double nextDouble() {
    /*
     * Because the IEEE 64-bit floating point format stores the leading '1' bit
     * of the mantissa implicitly, it effectively represents a mantissa in the
-     * range [0, 2**53) in only 52 bits. FloatingPoint<double>::kExponentShift
+     * range [0, 2^53) in only 52 bits. FloatingPoint<double>::kExponentShift
     * is the width of the bitfield in the in-memory format, so we must add one
     * to get the mantissa's range.
     */
@@ -109,6 +113,7 @@ class Xoroshiro128PlusPlusRNG {
    MOZ_ASSERT(aState0 || aState1);
    mState[0] = aState0;
    mState[1] = aState1;
+    mState[2] = 0; // Could be left uninitialized, but we do this just-in-case.
  }

  static size_t offsetOfState0() {
@@ -117,6 +122,9 @@ class Xoroshiro128PlusPlusRNG {
  static size_t offsetOfState1() {
    return offsetof(Xoroshiro128PlusPlusRNG, mState[1]);
  }
+  static size_t offsetOfState2() {
+    return offsetof(Xoroshiro128PlusPlusRNG, mState[2]);
+  }
 };

 } // namespace non_crypto