Line data Source code
1 : // This file contains an implementation of a vectorized cosine, which
2 : // is based in part on the implementations in the library "SLEEF" by
3 : // Naoki Shibata. SLEEF was used under the Boost Software License,
4 : // Version 1.0. The original source file contained the following
5 : // copyright notice:
6 : //
7 : // // Copyright Naoki Shibata 2010 - 2018.
8 : // // Distributed under the Boost Software License, Version 1.0.
9 : // // (See accompanying file LICENSE.txt or copy at
10 : // // http://www.boost.org/LICENSE_1_0.txt)
11 : //
12 : // SLEEF was used under the following license, which is not necessarily the
13 : // license that applies to this file:
14 : //
15 : // Boost Software License - Version 1.0 - August 17th, 2003
16 : //
17 : // Permission is hereby granted, free of charge, to any person or
18 : // organization obtaining a copy of the software and accompanying
19 : // documentation covered by this license (the "Software") to use,
20 : // reproduce, display, distribute, execute, and transmit the Software,
21 : // and to prepare derivative works of the Software, and to permit
22 : // third-parties to whom the Software is furnished to do so, all subject
23 : // to the following:
24 : //
25 : // The copyright notices in the Software and this entire statement,
26 : // including the above license grant, this restriction and the following
27 : // disclaimer, must be included in all copies of the Software, in whole
28 : // or in part, and all derivative works of the Software, unless such
29 : // copies or derivative works are solely in the form of
30 : // machine-executable object code generated by a source language
31 : // processor.
32 : //
33 : // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
34 : // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
35 : // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND
36 : // NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR ANYONE
37 : // DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR OTHER
38 : // LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
39 : // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
40 : // THE SOFTWARE.
41 :
42 : #include "crpropa/magneticField/turbulentField/PlaneWaveTurbulence.h"
43 : #include "crpropa/GridTools.h"
44 : #include "crpropa/Random.h"
45 : #include "crpropa/Units.h"
46 :
47 : #include "kiss/logger.h"
48 :
49 : #include <iostream>
50 :
51 : #if defined(FAST_WAVES)
52 : #if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__) && defined(__SSE4_1__) && defined(__SSE4_2__) && defined(__AVX__)
53 : #define ENABLE_FAST_WAVES
54 : #else
55 : #error "FAST_WAVES is enabled, but it appears that not all required SIMD extensions are enabled in the compiler. Without these extensions, the FAST_WAVES optimization cannot be used. Please make sure that the SIMD_EXTENSIONS option in cmake matches the capabilities of your target CPU, or (if your target CPU does not support the required extensions), disable the FAST_WAVES flag in cmake."
56 : #endif
57 : #endif
58 :
59 :
60 : #ifdef ENABLE_FAST_WAVES
61 : #include <immintrin.h>
62 : #include <memory>
63 : #endif
64 :
65 : namespace crpropa {
66 : #ifdef ENABLE_FAST_WAVES
67 : // see
68 : // https://stackoverflow.com/questions/49941645/get-sum-of-values-stored-in-m256d-with-sse-avx
69 : double hsum_double_avx(__m256d v) {
70 : __m128d vlow = _mm256_castpd256_pd128(v);
71 : __m128d vhigh = _mm256_extractf128_pd(v, 1); // high 128
72 : vlow = _mm_add_pd(vlow, vhigh); // reduce down to 128
73 :
74 : __m128d high64 = _mm_unpackhi_pd(vlow, vlow);
75 : return _mm_cvtsd_f64(_mm_add_sd(vlow, high64)); // reduce to scalar
76 : }
77 : #endif // defined(ENABLE_FAST_WAVES)
78 :
79 2 : PlaneWaveTurbulence::PlaneWaveTurbulence(const TurbulenceSpectrum &spectrum,
80 2 : int Nm, int seed)
81 2 : : TurbulentField(spectrum), Nm(Nm) {
82 :
83 : #ifdef ENABLE_FAST_WAVES
84 : KISS_LOG_INFO << "PlaneWaveTurbulence: Using SIMD TD13 implementation"
85 : << std::endl;
86 :
87 : // There used to be a cpuid check here, to see if the cpu running
88 : // this code would support SIMD (SSE + AVX). However, the library providing
89 : // the relevant function is no longer being used, and doing this manually
90 : // might be a bit too much work.
91 : #endif
92 :
93 2 : if (Nm <= 1) {
94 : throw std::runtime_error(
95 : "PlaneWaveTurbulence: Nm <= 1. Specify at least two wavemodes in "
96 0 : "order to generate the k distribution properly.");
97 : }
98 :
99 2 : Random random;
100 2 : if (seed != 0)
101 2 : random.seed(seed);
102 :
103 2 : double kmax = 2 * M_PI / spectrum.getLmin();
104 2 : double kmin = 2 * M_PI / spectrum.getLmax();
105 :
106 2 : xi = std::vector<Vector3d>(Nm, Vector3d(0.));
107 2 : kappa = std::vector<Vector3d>(Nm, Vector3d(0.));
108 2 : phi = std::vector<double>(Nm, 0.);
109 2 : costheta = std::vector<double>(Nm, 0.);
110 2 : beta = std::vector<double>(Nm, 0.);
111 2 : Ak = std::vector<double>(Nm, 0.);
112 2 : k = std::vector<double>(Nm, 0.);
113 :
114 2 : double delta = log10(kmax / kmin);
115 22 : for (int i = 0; i < Nm; i++) {
116 20 : k[i] = pow(10, log10(kmin) + ((double)i) / ((double)(Nm - 1)) * delta);
117 : }
118 :
119 : // * compute Ak *
120 :
121 : double delta_k0 =
122 2 : (k[1] - k[0]) / k[1]; // multiply this by k[i] to get delta_k[i]
123 : // Note: this is probably unnecessary since it's just a factor
124 : // and will get normalized out anyways. It's not like this is
125 : // performance-sensitive code though, and I don't want to change
126 : // anything numerical now.
127 :
128 : // For this loop, the Ak array actually contains Gk*delta_k (ie
129 : // non-normalized Ak^2). Normalization happens in a second loop,
130 : // once the total is known.
131 : double Ak2_sum = 0; // sum of Ak^2 over all k
132 22 : for (int i = 0; i < Nm; i++) {
133 20 : double k = this->k[i];
134 20 : double kHat = k * spectrum.getLbendover();
135 20 : double Gk = spectrum.energySpectrum(k) * (1 + kHat * kHat); // correct different implementation in TD 13 (eq. 5, missing + 1 in the denuminators exponent)
136 20 : Ak[i] = Gk * delta_k0 * k;
137 20 : Ak2_sum += Ak[i];
138 :
139 : // phi, costheta, and sintheta are for drawing vectors with
140 : // uniform distribution on the unit sphere.
141 : // This is similar to Random::randVector(): their t is our phi,
142 : // z is costheta, and r is sintheta. Our kappa is equivalent to
143 : // the return value of randVector(); however, TD13 then reuse
144 : // these values to generate a random vector perpendicular to kappa.
145 20 : double phi = random.randUniform(-M_PI, M_PI);
146 20 : double costheta = random.randUniform(-1., 1.);
147 20 : double sintheta = sqrt(1 - costheta * costheta);
148 :
149 20 : double alpha = random.randUniform(0, 2 * M_PI);
150 20 : double beta = random.randUniform(0, 2 * M_PI);
151 :
152 : Vector3d kappa =
153 20 : Vector3d(sintheta * cos(phi), sintheta * sin(phi), costheta);
154 :
155 : // NOTE: all other variable names match the ones from the TD13 paper.
156 : // However, our xi is actually their psi, and their xi isn't used at
157 : // all. (Though both can be used for the polarization vector, according
158 : // to the paper.) The reason for this discrepancy is that this code
159 : // used to be based on the original GJ99 paper, which provided only a
160 : // xi vector, and this xi happens to be almost the same as TD13's psi.
161 : Vector3d xi =
162 20 : Vector3d(costheta * cos(phi) * cos(alpha) + sin(phi) * sin(alpha),
163 20 : costheta * sin(phi) * cos(alpha) - cos(phi) * sin(alpha),
164 20 : -sintheta * cos(alpha));
165 :
166 : this->xi[i] = xi;
167 : this->kappa[i] = kappa;
168 20 : this->phi[i] = phi;
169 20 : this->costheta[i] = costheta;
170 20 : this->beta[i] = beta;
171 : }
172 :
173 : // Only in this loop are the actual Ak computed and stored.
174 : // This two-step process is necessary in order to normalize the values
175 : // properly.
176 22 : for (int i = 0; i < Nm; i++) {
177 20 : Ak[i] = sqrt(2 * Ak[i] / Ak2_sum) * spectrum.getBrms();
178 : }
179 :
180 : #ifdef ENABLE_FAST_WAVES
181 : // * copy data into AVX-compatible arrays *
182 : //
183 : // AVX requires all data to be aligned to 256 bit, or 32 bytes, which is the
184 : // same as 4 double precision floating point numbers. Since support for
185 : // alignments this big seems to be somewhat tentative in C++ allocators,
186 : // we're aligning them manually by allocating a normal double array, and
187 : // then computing the offset to the first value with the correct alignment.
188 : // This is a little bit of work, so instead of doing it separately for each
189 : // of the individual data arrays, we're doing it once for one big array that
190 : // all of the component arrays get packed into.
191 : //
192 : // The other thing to keep in mind is that AVX always reads in units of 256
193 : // bits, or 4 doubles. This means that our number of wavemodes must be
194 : // divisible by 4. If it isn't, we simply pad it out with zeros. Since the
195 : // final step of the computation of each wavemode is multiplication by the
196 : // amplitude, which will be set to 0, these padding wavemodes won't affect
197 : // the result.
198 :
199 : avx_Nm = ((Nm + 4 - 1) / 4) * 4; // round up to next larger multiple of 4:
200 : // align is 256 = 4 * sizeof(double) bit
201 : avx_data = std::vector<double>(itotal * avx_Nm + 3, 0.);
202 :
203 : // get the first 256-bit aligned element
204 : size_t size = avx_data.size() * sizeof(double);
205 : void *pointer = avx_data.data();
206 : align_offset =
207 : (double *)std::align(32, 32, pointer, size) - avx_data.data();
208 :
209 : // copy into the AVX arrays
210 : for (int i = 0; i < Nm; i++) {
211 : avx_data[i + align_offset + avx_Nm * iAxi0] = Ak[i] * xi[i].x;
212 : avx_data[i + align_offset + avx_Nm * iAxi1] = Ak[i] * xi[i].y;
213 : avx_data[i + align_offset + avx_Nm * iAxi2] = Ak[i] * xi[i].z;
214 :
215 : // The cosine implementation computes cos(pi*x), so we'll divide out the
216 : // pi here.
217 : avx_data[i + align_offset + avx_Nm * ikkappa0] =
218 : k[i] / M_PI * kappa[i].x;
219 : avx_data[i + align_offset + avx_Nm * ikkappa1] =
220 : k[i] / M_PI * kappa[i].y;
221 : avx_data[i + align_offset + avx_Nm * ikkappa2] =
222 : k[i] / M_PI * kappa[i].z;
223 :
224 : // We also need to divide beta by pi, since that goes into the argument
225 : // of the cosine as well.
226 : avx_data[i + align_offset + avx_Nm * ibeta] = beta[i] / M_PI;
227 : }
228 : #endif // ENABLE_FAST_WAVES
229 2 : }
230 :
231 45 : Vector3d PlaneWaveTurbulence::getField(const Vector3d &pos) const {
232 :
233 : #ifndef ENABLE_FAST_WAVES
234 : Vector3d B(0.);
235 495 : for (int i = 0; i < Nm; i++) {
236 450 : double z_ = pos.dot(kappa[i]);
237 450 : B += xi[i] * Ak[i] * cos(k[i] * z_ + beta[i]);
238 : }
239 45 : return B;
240 :
241 : #else // ENABLE_FAST_WAVES
242 :
243 : // Initialize accumulators
244 : //
245 : // There is one accumulator per component of the result vector.
246 : // Note that each accumulator contains four numbers. At the end of
247 : // the loop, each of these numbers will contain the sum of every
248 : // fourth wavemode, starting at a different offset. In the end, each
249 : // of the accumulator's numbers are added together (using
250 : // hsum_double_avx), resulting in the total sum for that component.
251 :
252 : __m256d acc0 = _mm256_setzero_pd();
253 : __m256d acc1 = _mm256_setzero_pd();
254 : __m256d acc2 = _mm256_setzero_pd();
255 :
256 : // broadcast position into AVX registers
257 : __m256d pos0 = _mm256_set1_pd(pos.x);
258 : __m256d pos1 = _mm256_set1_pd(pos.y);
259 : __m256d pos2 = _mm256_set1_pd(pos.z);
260 :
261 : for (int i = 0; i < avx_Nm; i += 4) {
262 :
263 : // Load data from memory into AVX registers:
264 : // - the three components of the vector A * xi
265 : __m256d Axi0 =
266 : _mm256_load_pd(avx_data.data() + i + align_offset + avx_Nm * iAxi0);
267 : __m256d Axi1 =
268 : _mm256_load_pd(avx_data.data() + i + align_offset + avx_Nm * iAxi1);
269 : __m256d Axi2 =
270 : _mm256_load_pd(avx_data.data() + i + align_offset + avx_Nm * iAxi2);
271 :
272 : // - the three components of the vector k * kappa
273 : __m256d kkappa0 = _mm256_load_pd(avx_data.data() + i + align_offset +
274 : avx_Nm * ikkappa0);
275 : __m256d kkappa1 = _mm256_load_pd(avx_data.data() + i + align_offset +
276 : avx_Nm * ikkappa1);
277 : __m256d kkappa2 = _mm256_load_pd(avx_data.data() + i + align_offset +
278 : avx_Nm * ikkappa2);
279 :
280 : // - the phase beta.
281 : __m256d beta =
282 : _mm256_load_pd(avx_data.data() + i + align_offset + avx_Nm * ibeta);
283 :
284 : // Then, do the computation.
285 :
286 : // This is the scalar product between k*kappa and pos:
287 : __m256d z = _mm256_add_pd(_mm256_mul_pd(pos0, kkappa0),
288 : _mm256_add_pd(_mm256_mul_pd(pos1, kkappa1),
289 : _mm256_mul_pd(pos2, kkappa2)));
290 :
291 : // Here, the phase is added on. This is the argument of the cosine.
292 : __m256d cos_arg = _mm256_add_pd(z, beta);
293 :
294 : // ********
295 : // * Computing the cosine
296 : // * Part 1: Argument reduction
297 : //
298 : // To understand the computation of the cosine, first note that the
299 : // cosine is periodic and we thus only need to model its behavior
300 : // between 0 and 2*pi to be able compute the function anywhere. In
301 : // fact, by mirroring the function along the x and y axes, even the
302 : // range between 0 and pi/2 is sufficient for this purpose. In this
303 : // range, the cosine can be efficiently evaluated with high precision
304 : // by using a polynomial approximation. Thus, to compute the cosine,
305 : // the input value is first reduced so that it lies within this range.
306 : // Then, the polynomial approximation is evaluated. Finally, if
307 : // necessary, the sign of the result is flipped (mirroring the function
308 : // along the x axis).
309 : //
310 : // The actual computation is slightly more involved. First, argument
311 : // reduction can be simplified drastically by computing cos(pi*x),
312 : // such that the values are reduced to the range [0, 0.5) instead of
313 : // [0, pi/2). Since the cosine is even (independent of the sign), we
314 : // can first reduce values to [-0.5, 0.5) – that is, a simple rounding
315 : // operation – and then neutralize the sign. In fact, precisely because
316 : // the cosine is even, all terms of the polynomial are powers of x^2,
317 : // so the value of x^2 (computed as x*x) forms the basis for the
318 : // polynomial approximation. If I understand things correctly, then (in
319 : // IEEE-754 floating point) x*x and (-x)*(-x) will always result in the
320 : // exact same value, which means that any error bound over [0, 0.5)
321 : // automatically applies to (-0.5, 0] as well.
322 :
323 : // First, compute round(x), and store it in q. If this value is odd,
324 : // we're looking at the negative half-wave of the cosine, and thus
325 : // will have to invert the sign of the result.
326 : __m256d q = _mm256_round_pd(
327 : cos_arg, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
328 :
329 : // Since we're computing cos(pi*x), round(x) always yields the center of
330 : // a half-wave (where cos(pi*x) achieves an extremum). This point
331 : // logically corresponds to x=0. Therefore, we subtract this center from
332 : // the actual input argument to find the corresponding point on the
333 : // half-wave that is centered around zero.
334 : __m256d s = _mm256_sub_pd(cos_arg, q);
335 :
336 : // We now want to check whether q (the index of our half-wave) is even
337 : // or odd, since all of the odd-numbered half-waves are negative, so
338 : // we'll have to flip the final result. On an int, this is as simple as
339 : // checking the 0th bit. Idea: manipulate the double in such a way that
340 : // we can do this. So, we add 2^52, such that the last digit of the
341 : // mantissa is actually in the ones' position. Since q may be negative,
342 : // we'll also add 2^51 to make sure it's positive. Note that 2^51 is
343 : // even and thus leaves evenness invariant, which is the only thing we
344 : // care about here.
345 : //
346 : // This is based on the int extraction process described here:
347 : // https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx/41223013
348 : //
349 : // We assume -2^51 <= q < 2^51 for this, which is unproblematic, as
350 : // double precision has decayed far enough at that point that the
351 : // usefulness of the cosine becomes limited.
352 : //
353 : // Explanation: The mantissa of a double-precision float has 52 bits
354 : // (excluding the implicit first bit, which is always one). If |q| >
355 : // 2^51, this implicit first bit has a place value of at least 2^51,
356 : // while the first stored bit of the mantissa has a place value of at
357 : // least 2^50. This means that the LSB of the mantissa has a place value
358 : // of at least 2^(-1), or 0.5. For a cos(pi*x), this corresponds to a
359 : // quarter of a cycle (pi/2), so at this point the precision of the
360 : // input argument is so low that going from one representable number to
361 : // the next causes the result to jump by +/-1.
362 :
363 : q = _mm256_add_pd(q, _mm256_set1_pd(0x0018000000000000));
364 :
365 : // Unfortunately, integer comparisons were only introduced in AVX2, so
366 : // we'll have to make do with a floating point comparison to check
367 : // whether the last bit is set. However, masking out all but the last
368 : // bit will result in a denormal float, which may either result in
369 : // performance problems or just be rounded down to zero, neither of
370 : // which is what we want here. To fix this, we'll mask in not only bit
371 : // 0, but also the exponent (and sign, but that doesn't matter) of q.
372 : // Luckily, the exponent of q is guaranteed to have the fixed value of
373 : // 1075 (corresponding to 2^52) after our addition.
374 :
375 : __m256d invert = _mm256_and_pd(
376 : q, _mm256_castsi256_pd(_mm256_set1_epi64x(0xfff0000000000001)));
377 :
378 : // If we did have a one in bit 0, our result will be equal to 2^52 + 1.
379 : invert = _mm256_cmp_pd(
380 : invert, _mm256_castsi256_pd(_mm256_set1_epi64x(0x4330000000000001)),
381 : _CMP_EQ_OQ);
382 :
383 : // Now we know whether to flip the sign of the result. However, remember
384 : // that we're working on multiple values at a time, so an if statement
385 : // won't be of much use here (plus it might perform badly). Instead,
386 : // we'll make use of the fact that the result of the comparison is all
387 : // ones if the comparison was true (i.e. q is odd and we need to flip
388 : // the result), and all zeroes otherwise. If we now mask out all bits
389 : // except the sign bit, we get something that, when xor'ed into our
390 : // final result, will flip the sign exactly when q is odd.
391 : invert = _mm256_and_pd(invert, _mm256_set1_pd(-0.0));
392 : // (Note that the binary representation of -0.0 is all 0 bits, except
393 : // for the sign bit, which is set to 1.)
394 :
395 : // TODO: clamp floats between 0 and 1? This would ensure that we never
396 : // see inf's, but maybe we want that, so that things dont just fail
397 : // silently...
398 :
399 : // * end of argument reduction
400 : // *******
401 :
402 : // ******
403 : // * Evaluate the cosine using a polynomial approximation for the zeroth
404 : // half-wave.
405 : // * The coefficients for this were generated using sleefs gencoef.c.
406 : // * These coefficients are probably far from optimal; however, they
407 : // should be sufficient for this case.
408 : s = _mm256_mul_pd(s, s);
409 :
410 : __m256d u = _mm256_set1_pd(+0.2211852080653743946e+0);
411 :
412 : u = _mm256_add_pd(_mm256_mul_pd(u, s),
413 : _mm256_set1_pd(-0.1332560668688523853e+1));
414 : u = _mm256_add_pd(_mm256_mul_pd(u, s),
415 : _mm256_set1_pd(+0.4058509506474178075e+1));
416 : u = _mm256_add_pd(_mm256_mul_pd(u, s),
417 : _mm256_set1_pd(-0.4934797516664651162e+1));
418 : u = _mm256_add_pd(_mm256_mul_pd(u, s), _mm256_set1_pd(1.));
419 :
420 : // Then, flip the sign of each double for which invert is not zero.
421 : // Since invert has only zero bits except for a possible one in bit 63,
422 : // we can xor it onto our result to selectively invert the 63rd (sign)
423 : // bit in each double where invert is set.
424 : u = _mm256_xor_pd(u, invert);
425 :
426 : // * end computation of cosine
427 : // **********
428 :
429 : // Finally, Ak*xi is multiplied on. Since this is a vector, the
430 : // multiplication needs to be done for each of the three
431 : // components, so it happens separately.
432 : acc0 = _mm256_add_pd(_mm256_mul_pd(u, Axi0), acc0);
433 : acc1 = _mm256_add_pd(_mm256_mul_pd(u, Axi1), acc1);
434 : acc2 = _mm256_add_pd(_mm256_mul_pd(u, Axi2), acc2);
435 : }
436 :
437 : return Vector3d(hsum_double_avx(acc0), hsum_double_avx(acc1),
438 : hsum_double_avx(acc2));
439 : #endif // ENABLE_FAST_WAVES
440 : }
441 :
442 : } // namespace crpropa
|