/build/rocrand-7S8maf/rocrand-7.1.1/library/include/rocrand/rocrand_mrg32k3a.h Source File

/build/rocrand-7S8maf/rocrand-7.1.1/library/include/rocrand/rocrand_mrg32k3a.h Source File#

API library: /build/rocrand-7S8maf/rocrand-7.1.1/library/include/rocrand/rocrand_mrg32k3a.h Source File
rocrand_mrg32k3a.h
1// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21#ifndef ROCRAND_MRG32K3A_H_
22#define ROCRAND_MRG32K3A_H_
23
24#include "rocrand/rocrand_common.h"
25#include "rocrand/rocrand_mrg32k3a_precomputed.h"
26
27#include <hip/hip_runtime.h>
28
29#define ROCRAND_MRG32K3A_POW32 4294967296U
30#define ROCRAND_MRG32K3A_M1 4294967087U
31#define ROCRAND_MRG32K3A_M1C 209U
32#define ROCRAND_MRG32K3A_M2 4294944443U
33#define ROCRAND_MRG32K3A_M2C 22853U
34#define ROCRAND_MRG32K3A_A12 1403580U
35#define ROCRAND_MRG32K3A_A13 (4294967087U - 810728U)
36#define ROCRAND_MRG32K3A_A13N 810728U
37#define ROCRAND_MRG32K3A_A21 527612U
38#define ROCRAND_MRG32K3A_A23 (4294944443U - 1370589U)
39#define ROCRAND_MRG32K3A_A23N 1370589U
40#define ROCRAND_MRG32K3A_NORM_DOUBLE (2.3283065498378288e-10) // 1/ROCRAND_MRG32K3A_M1
41#define ROCRAND_MRG32K3A_UINT_NORM \
42 (1.000000048661607) // (ROCRAND_MRG32K3A_POW32 - 1)/(ROCRAND_MRG32K3A_M1 - 1)
43
52 #define ROCRAND_MRG32K3A_DEFAULT_SEED 12345ULL // end of group rocranddevice
54
55namespace rocrand_device {
56
57class mrg32k3a_engine
58{
59public:
60 struct mrg32k3a_state
61 {
62 unsigned int g1[3];
63 unsigned int g2[3];
64
65 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
66 // The Box–Muller transform requires two inputs to convert uniformly
67 // distributed real values [0; 1] to normally distributed real values
68 // (with mean = 0, and stddev = 1). Often user wants only one
69 // normally distributed number, to save performance and random
70 // numbers the 2nd value is saved for future requests.
71 unsigned int boxmuller_float_state; // is there a float in boxmuller_float
72 unsigned int boxmuller_double_state; // is there a double in boxmuller_double
73 float boxmuller_float; // normally distributed float
74 double boxmuller_double; // normally distributed double
75 #endif
76 };
77
78 __forceinline__ __device__ __host__ mrg32k3a_engine()
79 {
80 this->seed(ROCRAND_MRG32K3A_DEFAULT_SEED, 0, 0);
81 }
82
91 __forceinline__ __device__ __host__ mrg32k3a_engine(const unsigned long long seed,
92 const unsigned long long subsequence,
93 const unsigned long long offset)
94 {
95 this->seed(seed, subsequence, offset);
96 }
97
106 __forceinline__ __device__ __host__ void seed(unsigned long long seed_value,
107 const unsigned long long subsequence,
108 const unsigned long long offset)
109 {
110 if(seed_value == 0)
111 {
113 }
114 unsigned int x = (unsigned int) seed_value ^ 0x55555555U;
115 unsigned int y = (unsigned int) ((seed_value >> 32) ^ 0xAAAAAAAAU);
116 m_state.g1[0] = mod_mul_m1(x, seed_value);
117 m_state.g1[1] = mod_mul_m1(y, seed_value);
118 m_state.g1[2] = mod_mul_m1(x, seed_value);
119 m_state.g2[0] = mod_mul_m2(y, seed_value);
120 m_state.g2[1] = mod_mul_m2(x, seed_value);
121 m_state.g2[2] = mod_mul_m2(y, seed_value);
122 this->restart(subsequence, offset);
123 }
124
126 __forceinline__ __device__ __host__ void discard(unsigned long long offset)
127 {
128 this->discard_impl(offset);
129 }
130
133 __forceinline__ __device__ __host__ void discard_subsequence(unsigned long long subsequence)
134 {
135 this->discard_subsequence_impl(subsequence);
136 }
137
140 __forceinline__ __device__ __host__ void discard_sequence(unsigned long long sequence)
141 {
142 this->discard_sequence_impl(sequence);
143 }
144
145 __forceinline__ __device__ __host__ void restart(const unsigned long long subsequence,
146 const unsigned long long offset)
147 {
148 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
149 m_state.boxmuller_float_state = 0;
150 m_state.boxmuller_double_state = 0;
151 #endif
152 this->discard_subsequence_impl(subsequence);
153 this->discard_impl(offset);
154 }
155
156 __forceinline__ __device__ __host__ unsigned int operator()()
157 {
158 return this->next();
159 }
160
161 // Returned value is in range [1, ROCRAND_MRG32K3A_M1],
162 // where ROCRAND_MRG32K3A_M1 < UINT_MAX
163 __forceinline__ __device__ __host__
164 unsigned int next()
165 {
166 const unsigned int p1 = mod_m1(detail::mad_u64_u32(
167 ROCRAND_MRG32K3A_A12,
168 m_state.g1[1],
169 detail::mul_u64_u32(ROCRAND_MRG32K3A_A13N, (ROCRAND_MRG32K3A_M1 - m_state.g1[0]))));
170
171 m_state.g1[0] = m_state.g1[1];
172 m_state.g1[1] = m_state.g1[2];
173 m_state.g1[2] = p1;
174
175 const unsigned int p2 = mod_m2(detail::mad_u64_u32(
176 ROCRAND_MRG32K3A_A21,
177 m_state.g2[2],
178 detail::mul_u64_u32(ROCRAND_MRG32K3A_A23N, (ROCRAND_MRG32K3A_M2 - m_state.g2[0]))));
179
180 m_state.g2[0] = m_state.g2[1];
181 m_state.g2[1] = m_state.g2[2];
182 m_state.g2[2] = p2;
183
184 return (p1 - p2) + (p1 <= p2 ? ROCRAND_MRG32K3A_M1 : 0);
185 }
186
187protected:
188 // Advances the internal state to skip \p offset numbers.
189 // DOES NOT CALCULATE NEW ULONGLONG
190 __forceinline__ __device__ __host__ void discard_impl(unsigned long long offset)
191 {
192 discard_state(offset);
193 }
194
195 // DOES NOT CALCULATE NEW ULONGLONG
196 __forceinline__ __device__ __host__ void
197 discard_subsequence_impl(unsigned long long subsequence)
198 {
199 int i = 0;
200
201 while(subsequence > 0) {
202 if (subsequence & 1) {
203 #if defined(__HIP_DEVICE_COMPILE__)
204 mod_mat_vec_m1(d_A1P76 + i, m_state.g1);
205 mod_mat_vec_m2(d_A2P76 + i, m_state.g2);
206 #else
207 mod_mat_vec_m1(h_A1P76 + i, m_state.g1);
208 mod_mat_vec_m2(h_A2P76 + i, m_state.g2);
209 #endif
210 }
211 subsequence >>= 1;
212 i += 9;
213 }
214 }
215
216 // DOES NOT CALCULATE NEW ULONGLONG
217 __forceinline__ __device__ __host__ void discard_sequence_impl(unsigned long long sequence)
218 {
219 int i = 0;
220
221 while(sequence > 0) {
222 if (sequence & 1) {
223 #if defined(__HIP_DEVICE_COMPILE__)
224 mod_mat_vec_m1(d_A1P127 + i, m_state.g1);
225 mod_mat_vec_m2(d_A2P127 + i, m_state.g2);
226 #else
227 mod_mat_vec_m1(h_A1P127 + i, m_state.g1);
228 mod_mat_vec_m2(h_A2P127 + i, m_state.g2);
229 #endif
230 }
231 sequence >>= 1;
232 i += 9;
233 }
234 }
235
236 // Advances the internal state by offset times.
237 // DOES NOT CALCULATE NEW ULONGLONG
238 __forceinline__ __device__ __host__ void discard_state(unsigned long long offset)
239 {
240 int i = 0;
241
242 while(offset > 0) {
243 if (offset & 1) {
244 #if defined(__HIP_DEVICE_COMPILE__)
245 mod_mat_vec_m1(d_A1 + i, m_state.g1);
246 mod_mat_vec_m2(d_A2 + i, m_state.g2);
247 #else
248 mod_mat_vec_m1(h_A1 + i, m_state.g1);
249 mod_mat_vec_m2(h_A2 + i, m_state.g2);
250 #endif
251 }
252 offset >>= 1;
253 i += 9;
254 }
255 }
256
257 // Advances the internal state to the next state
258 // DOES NOT CALCULATE NEW ULONGLONG
259 __forceinline__ __device__ __host__ void discard_state()
260 {
261 discard_state(1);
262 }
263
264private:
265 __forceinline__ __device__ __host__
266 static void mod_mat_vec_m1(const unsigned int* A, unsigned int* s)
267 {
268 unsigned long long x[3] = {s[0], s[1], s[2]};
269
270 s[0] = mod_m1(mod_m1(A[0] * x[0]) + mod_m1(A[1] * x[1]) + mod_m1(A[2] * x[2]));
271
272 s[1] = mod_m1(mod_m1(A[3] * x[0]) + mod_m1(A[4] * x[1]) + mod_m1(A[5] * x[2]));
273
274 s[2] = mod_m1(mod_m1(A[6] * x[0]) + mod_m1(A[7] * x[1]) + mod_m1(A[8] * x[2]));
275 }
276
277 __forceinline__ __device__ __host__
278 static void mod_mat_vec_m2(const unsigned int* A, unsigned int* s)
279 {
280 unsigned long long x[3] = {s[0], s[1], s[2]};
281
282 s[0] = mod_m2(mod_m2(A[0] * x[0]) + mod_m2(A[1] * x[1]) + mod_m2(A[2] * x[2]));
283
284 s[1] = mod_m2(mod_m2(A[3] * x[0]) + mod_m2(A[4] * x[1]) + mod_m2(A[5] * x[2]));
285
286 s[2] = mod_m2(mod_m2(A[6] * x[0]) + mod_m2(A[7] * x[1]) + mod_m2(A[8] * x[2]));
287 }
288
289 __forceinline__ __device__ __host__ static unsigned long long mod_mul_m1(unsigned int i,
290 unsigned long long j)
291 {
292 long long hi, lo, temp1, temp2;
293
294 hi = i / 131072;
295 lo = i - (hi * 131072);
296 temp1 = mod_m1(hi * j) * 131072;
297 temp2 = mod_m1(lo * j);
298 lo = mod_m1(temp1 + temp2);
299
300 if (lo < 0)
301 lo += ROCRAND_MRG32K3A_M1;
302 return lo;
303 }
304
305 __forceinline__ __device__ __host__
306 static unsigned long long mod_m1(unsigned long long p)
307 {
308 p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M1C,
309 static_cast<unsigned int>(p >> 32),
310 static_cast<unsigned int>(p));
311 if(p >= ROCRAND_MRG32K3A_M1)
312 p -= ROCRAND_MRG32K3A_M1;
313
314 return p;
315 }
316
317 __forceinline__ __device__ __host__
318 static unsigned long long mod_mul_m2(unsigned int i, unsigned long long j)
319 {
320 long long hi, lo, temp1, temp2;
321
322 hi = i / 131072;
323 lo = i - (hi * 131072);
324 temp1 = mod_m2(hi * j) * 131072;
325 temp2 = mod_m2(lo * j);
326 lo = mod_m2(temp1 + temp2);
327
328 if (lo < 0)
329 lo += ROCRAND_MRG32K3A_M2;
330 return lo;
331 }
332
333 __forceinline__ __device__ __host__
334 static unsigned long long mod_m2(unsigned long long p)
335 {
336 p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M2C,
337 static_cast<unsigned int>(p >> 32),
338 static_cast<unsigned int>(p));
339 p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M2C,
340 static_cast<unsigned int>(p >> 32),
341 static_cast<unsigned int>(p));
342 if(p >= ROCRAND_MRG32K3A_M2)
343 p -= ROCRAND_MRG32K3A_M2;
344
345 return p;
346 }
347
348protected:
349 // State
350 mrg32k3a_state m_state;
351
352 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
353 friend struct detail::engine_boxmuller_helper<mrg32k3a_engine>;
354 #endif
355
356}; // mrg32k3a_engine class
357
358} // end namespace rocrand_device
359
364
366typedef rocrand_device::mrg32k3a_engine rocrand_state_mrg32k3a;
368
380__forceinline__ __device__ __host__
381void rocrand_init(const unsigned long long seed,
382 const unsigned long long subsequence,
383 const unsigned long long offset,
384 rocrand_state_mrg32k3a* state)
385{
386 *state = rocrand_state_mrg32k3a(seed, subsequence, offset);
387}
388
401__forceinline__ __device__ __host__
402unsigned int rocrand(rocrand_state_mrg32k3a* state)
403{
404 // next() in [1, ROCRAND_MRG32K3A_M1]
405 return static_cast<unsigned int>((state->next() - 1) * ROCRAND_MRG32K3A_UINT_NORM);
406}
407
416__forceinline__ __device__ __host__
417void skipahead(unsigned long long offset, rocrand_state_mrg32k3a* state)
418{
419 return state->discard(offset);
420}
421
431__forceinline__ __device__ __host__
432void skipahead_subsequence(unsigned long long subsequence, rocrand_state_mrg32k3a* state)
433{
434 return state->discard_subsequence(subsequence);
435}
436
446__forceinline__ __device__ __host__
447void skipahead_sequence(unsigned long long sequence, rocrand_state_mrg32k3a* state)
448{
449 return state->discard_sequence(sequence);
450}
451 // end of group rocranddevice
453
454#endif // ROCRAND_MRG32K3A_H_
#define ROCRAND_MRG32K3A_DEFAULT_SEED
Default seed for MRG32K3A PRNG.
Definition rocrand_mrg32k3a.h:52
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_mrg32k3a *state)
Initializes MRG32K3A state.
Definition rocrand_mrg32k3a.h:381
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by subsequence subsequences.
Definition rocrand_mrg32k3a.h:432
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by offset elements.
Definition rocrand_mrg32k3a.h:417
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_mrg32k3a *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition rocrand_mrg32k3a.h:402
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by sequence sequences.
Definition rocrand_mrg32k3a.h:447