LCOV - code coverage report
Current view: top level - src/magneticField/turbulentField - PlaneWaveTurbulence.cpp (source / functions) Coverage Total Hit
Test: coverage.info.cleaned Lines: 97.9 % 47 46
Test Date: 2026-06-18 09:49:19 Functions: 100.0 % 2 2

            Line data    Source code
       1              : // This file contains an implementation of a vectorized cosine, which
       2              : // is based in part on the implementations in the library "SLEEF" by
       3              : // Naoki Shibata. SLEEF was used under the Boost Software License,
       4              : // Version 1.0. The original source file contained the following
       5              : // copyright notice:
       6              : //
       7              : //   //          Copyright Naoki Shibata 2010 - 2018.
       8              : //   // Distributed under the Boost Software License, Version 1.0.
       9              : //   //    (See accompanying file LICENSE.txt or copy at
      10              : //   //          http://www.boost.org/LICENSE_1_0.txt)
      11              : //
      12              : // SLEEF was used under the following license, which is not necessarily the
      13              : // license that applies to this file:
      14              : //
      15              : //         Boost Software License - Version 1.0 - August 17th, 2003
      16              : //
      17              : //         Permission is hereby granted, free of charge, to any person or
      18              : //         organization obtaining a copy of the software and accompanying
      19              : //         documentation covered by this license (the "Software") to use,
      20              : //         reproduce, display, distribute, execute, and transmit the Software,
      21              : //         and to prepare derivative works of the Software, and to permit
      22              : //         third-parties to whom the Software is furnished to do so, all subject
      23              : //         to the following:
      24              : //
      25              : //         The copyright notices in the Software and this entire statement,
      26              : //         including the above license grant, this restriction and the following
      27              : //         disclaimer, must be included in all copies of the Software, in whole
      28              : //         or in part, and all derivative works of the Software, unless such
      29              : //         copies or derivative works are solely in the form of
      30              : //         machine-executable object code generated by a source language
      31              : //         processor.
      32              : //
      33              : //         THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
      34              : //         EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      35              : //         MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND
      36              : //         NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR ANYONE
      37              : //         DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR OTHER
      38              : //         LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
      39              : //         OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
      40              : //         THE SOFTWARE.
      41              : 
      42              : #include "crpropa/magneticField/turbulentField/PlaneWaveTurbulence.h"
      43              : #include "crpropa/GridTools.h"
      44              : #include "crpropa/Random.h"
      45              : #include "crpropa/Units.h"
      46              : 
      47              : #include "kiss/logger.h"
      48              : 
      49              : #include <iostream>
      50              : 
      51              : #if defined(FAST_WAVES)
      52              : #if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__) && defined(__SSE4_1__) && defined(__SSE4_2__) && defined(__AVX__)
      53              : #define ENABLE_FAST_WAVES
      54              : #else
      55              : #error "FAST_WAVES is enabled, but it appears that not all required SIMD extensions are enabled in the compiler. Without these extensions, the FAST_WAVES optimization cannot be used. Please make sure that the SIMD_EXTENSIONS option in cmake matches the capabilities of your target CPU, or (if your target CPU does not support the required extensions), disable the FAST_WAVES flag in cmake."
      56              : #endif
      57              : #endif
      58              : 
      59              : 
      60              : #ifdef ENABLE_FAST_WAVES
      61              : #include <immintrin.h>
      62              : #include <memory>
      63              : #endif
      64              : 
      65              : namespace crpropa {
      66              : #ifdef ENABLE_FAST_WAVES
      67              : // see
      68              : // https://stackoverflow.com/questions/49941645/get-sum-of-values-stored-in-m256d-with-sse-avx
      69              : double hsum_double_avx(__m256d v) {
      70              :         __m128d vlow = _mm256_castpd256_pd128(v);
      71              :         __m128d vhigh = _mm256_extractf128_pd(v, 1); // high 128
      72              :         vlow = _mm_add_pd(vlow, vhigh);              // reduce down to 128
      73              : 
      74              :         __m128d high64 = _mm_unpackhi_pd(vlow, vlow);
      75              :         return _mm_cvtsd_f64(_mm_add_sd(vlow, high64)); // reduce to scalar
      76              : }
      77              : #endif // defined(ENABLE_FAST_WAVES)
      78              : 
      79            2 : PlaneWaveTurbulence::PlaneWaveTurbulence(const TurbulenceSpectrum &spectrum,
      80            2 :                                          int Nm, int seed)
      81            2 :     : TurbulentField(spectrum), Nm(Nm) {
      82              : 
      83              : #ifdef ENABLE_FAST_WAVES
      84              :         KISS_LOG_INFO << "PlaneWaveTurbulence: Using SIMD TD13 implementation"
      85              :                       << std::endl;
      86              : 
      87              :         // There used to be a cpuid check here, to see if the cpu running
      88              :         // this code would support SIMD (SSE + AVX). However, the library providing
      89              :         // the relevant function is no longer being used, and doing this manually
      90              :         // might be a bit too much work.
      91              : #endif
      92              : 
      93            2 :         if (Nm <= 1) {
      94              :                 throw std::runtime_error(
      95              :                     "PlaneWaveTurbulence: Nm <= 1. Specify at least two wavemodes in "
      96            0 :                     "order to generate the k distribution properly.");
      97              :         }
      98              : 
      99            2 :         Random random;
     100            2 :         if (seed != 0)
     101            2 :                 random.seed(seed);
     102              : 
     103            2 :         double kmax = 2 * M_PI / spectrum.getLmin();
     104            2 :         double kmin = 2 * M_PI / spectrum.getLmax();
     105              : 
     106            2 :         xi = std::vector<Vector3d>(Nm, Vector3d(0.));
     107            2 :         kappa = std::vector<Vector3d>(Nm, Vector3d(0.));
     108            2 :         phi = std::vector<double>(Nm, 0.);
     109            2 :         costheta = std::vector<double>(Nm, 0.);
     110            2 :         beta = std::vector<double>(Nm, 0.);
     111            2 :         Ak = std::vector<double>(Nm, 0.);
     112            2 :         k = std::vector<double>(Nm, 0.);
     113              : 
     114            2 :         double delta = log10(kmax / kmin);
     115           22 :         for (int i = 0; i < Nm; i++) {
     116           20 :                 k[i] = pow(10, log10(kmin) + ((double)i) / ((double)(Nm - 1)) * delta);
     117              :         }
     118              : 
     119              :         // * compute Ak *
     120              : 
     121              :         double delta_k0 =
     122            2 :             (k[1] - k[0]) / k[1]; // multiply this by k[i] to get delta_k[i]
     123              :         // Note: this is probably unnecessary since it's just a factor
     124              :         // and will get normalized out anyways. It's not like this is
     125              :         // performance-sensitive code though, and I don't want to change
     126              :         // anything numerical now.
     127              : 
     128              :         // For this loop, the Ak array actually contains Gk*delta_k (ie
     129              :         // non-normalized Ak^2). Normalization happens in a second loop,
     130              :         // once the total is known.
     131              :         double Ak2_sum = 0; // sum of Ak^2 over all k
     132           22 :         for (int i = 0; i < Nm; i++) {
     133           20 :                 double k = this->k[i];
     134           20 :                 double kHat = k * spectrum.getLbendover();
     135           20 :                 double Gk = spectrum.energySpectrum(k) * (1 + kHat * kHat);     // correct different implementation in TD 13 (eq. 5, missing + 1 in the denuminators exponent)
     136           20 :                 Ak[i] = Gk * delta_k0 * k;
     137           20 :                 Ak2_sum += Ak[i];
     138              : 
     139              :                 // phi, costheta, and sintheta are for drawing vectors with
     140              :                 // uniform distribution on the unit sphere.
     141              :                 // This is similar to Random::randVector(): their t is our phi,
     142              :                 // z is costheta, and r is sintheta. Our kappa is equivalent to
     143              :                 // the return value of randVector(); however, TD13 then reuse
     144              :                 // these values to generate a random vector perpendicular to kappa.
     145           20 :                 double phi = random.randUniform(-M_PI, M_PI);
     146           20 :                 double costheta = random.randUniform(-1., 1.);
     147           20 :                 double sintheta = sqrt(1 - costheta * costheta);
     148              : 
     149           20 :                 double alpha = random.randUniform(0, 2 * M_PI);
     150           20 :                 double beta = random.randUniform(0, 2 * M_PI);
     151              : 
     152              :                 Vector3d kappa =
     153           20 :                     Vector3d(sintheta * cos(phi), sintheta * sin(phi), costheta);
     154              : 
     155              :                 // NOTE: all other variable names match the ones from the TD13 paper.
     156              :                 // However, our xi is actually their psi, and their xi isn't used at
     157              :                 // all. (Though both can be used for the polarization vector, according
     158              :                 // to the paper.) The reason for this discrepancy is that this code
     159              :                 // used to be based on the original GJ99 paper, which provided only a
     160              :                 // xi vector, and this xi happens to be almost the same as TD13's psi.
     161              :                 Vector3d xi =
     162           20 :                     Vector3d(costheta * cos(phi) * cos(alpha) + sin(phi) * sin(alpha),
     163           20 :                              costheta * sin(phi) * cos(alpha) - cos(phi) * sin(alpha),
     164           20 :                              -sintheta * cos(alpha));
     165              : 
     166              :                 this->xi[i] = xi;
     167              :                 this->kappa[i] = kappa;
     168           20 :                 this->phi[i] = phi;
     169           20 :                 this->costheta[i] = costheta;
     170           20 :                 this->beta[i] = beta;
     171              :         }
     172              : 
     173              :         // Only in this loop are the actual Ak computed and stored.
     174              :         // This two-step process is necessary in order to normalize the values
     175              :         // properly.
     176           22 :         for (int i = 0; i < Nm; i++) {
     177           20 :                 Ak[i] = sqrt(2 * Ak[i] / Ak2_sum) * spectrum.getBrms();
     178              :         }
     179              : 
     180              : #ifdef ENABLE_FAST_WAVES
     181              :         // * copy data into AVX-compatible arrays *
     182              :         //
     183              :         // AVX requires all data to be aligned to 256 bit, or 32 bytes, which is the
     184              :         // same as 4 double precision floating point numbers. Since support for
     185              :         // alignments this big seems to be somewhat tentative in C++ allocators,
     186              :         // we're aligning them manually by allocating a normal double array, and
     187              :         // then computing the offset to the first value with the correct alignment.
     188              :         // This is a little bit of work, so instead of doing it separately for each
     189              :         // of the individual data arrays, we're doing it once for one big array that
     190              :         // all of the component arrays get packed into.
     191              :         //
     192              :         // The other thing to keep in mind is that AVX always reads in units of 256
     193              :         // bits, or 4 doubles. This means that our number of wavemodes must be
     194              :         // divisible by 4. If it isn't, we simply pad it out with zeros. Since the
     195              :         // final step of the computation of each wavemode is multiplication by the
     196              :         // amplitude, which will be set to 0, these padding wavemodes won't affect
     197              :         // the result.
     198              : 
     199              :         avx_Nm = ((Nm + 4 - 1) / 4) * 4; // round up to next larger multiple of 4:
     200              :                                          // align is 256 = 4 * sizeof(double) bit
     201              :         avx_data = std::vector<double>(itotal * avx_Nm + 3, 0.);
     202              : 
     203              :         // get the first 256-bit aligned element
     204              :         size_t size = avx_data.size() * sizeof(double);
     205              :         void *pointer = avx_data.data();
     206              :         align_offset =
     207              :             (double *)std::align(32, 32, pointer, size) - avx_data.data();
     208              : 
     209              :         // copy into the AVX arrays
     210              :         for (int i = 0; i < Nm; i++) {
     211              :                 avx_data[i + align_offset + avx_Nm * iAxi0] = Ak[i] * xi[i].x;
     212              :                 avx_data[i + align_offset + avx_Nm * iAxi1] = Ak[i] * xi[i].y;
     213              :                 avx_data[i + align_offset + avx_Nm * iAxi2] = Ak[i] * xi[i].z;
     214              : 
     215              :                 // The cosine implementation computes cos(pi*x), so we'll divide out the
     216              :                 // pi here.
     217              :                 avx_data[i + align_offset + avx_Nm * ikkappa0] =
     218              :                     k[i] / M_PI * kappa[i].x;
     219              :                 avx_data[i + align_offset + avx_Nm * ikkappa1] =
     220              :                     k[i] / M_PI * kappa[i].y;
     221              :                 avx_data[i + align_offset + avx_Nm * ikkappa2] =
     222              :                     k[i] / M_PI * kappa[i].z;
     223              : 
     224              :                 // We also need to divide beta by pi, since that goes into the argument
     225              :                 // of the cosine as well.
     226              :                 avx_data[i + align_offset + avx_Nm * ibeta] = beta[i] / M_PI;
     227              :         }
     228              : #endif // ENABLE_FAST_WAVES
     229            2 : }
     230              : 
     231           45 : Vector3d PlaneWaveTurbulence::getField(const Vector3d &pos) const {
     232              : 
     233              : #ifndef ENABLE_FAST_WAVES
     234              :         Vector3d B(0.);
     235          495 :         for (int i = 0; i < Nm; i++) {
     236          450 :                 double z_ = pos.dot(kappa[i]);
     237          450 :                 B += xi[i] * Ak[i] * cos(k[i] * z_ + beta[i]);
     238              :         }
     239           45 :         return B;
     240              : 
     241              : #else  // ENABLE_FAST_WAVES
     242              : 
     243              :         // Initialize accumulators
     244              :         //
     245              :         // There is one accumulator per component of the result vector.
     246              :         // Note that each accumulator contains four numbers. At the end of
     247              :         // the loop, each of these numbers will contain the sum of every
     248              :         // fourth wavemode, starting at a different offset. In the end, each
     249              :         // of the accumulator's numbers are added together (using
     250              :         // hsum_double_avx), resulting in the total sum for that component.
     251              : 
     252              :         __m256d acc0 = _mm256_setzero_pd();
     253              :         __m256d acc1 = _mm256_setzero_pd();
     254              :         __m256d acc2 = _mm256_setzero_pd();
     255              : 
     256              :         // broadcast position into AVX registers
     257              :         __m256d pos0 = _mm256_set1_pd(pos.x);
     258              :         __m256d pos1 = _mm256_set1_pd(pos.y);
     259              :         __m256d pos2 = _mm256_set1_pd(pos.z);
     260              : 
     261              :         for (int i = 0; i < avx_Nm; i += 4) {
     262              : 
     263              :                 // Load data from memory into AVX registers:
     264              :                 //  - the three components of the vector A * xi
     265              :                 __m256d Axi0 =
     266              :                     _mm256_load_pd(avx_data.data() + i + align_offset + avx_Nm * iAxi0);
     267              :                 __m256d Axi1 =
     268              :                     _mm256_load_pd(avx_data.data() + i + align_offset + avx_Nm * iAxi1);
     269              :                 __m256d Axi2 =
     270              :                     _mm256_load_pd(avx_data.data() + i + align_offset + avx_Nm * iAxi2);
     271              : 
     272              :                 //  - the three components of the vector k * kappa
     273              :                 __m256d kkappa0 = _mm256_load_pd(avx_data.data() + i + align_offset +
     274              :                                                  avx_Nm * ikkappa0);
     275              :                 __m256d kkappa1 = _mm256_load_pd(avx_data.data() + i + align_offset +
     276              :                                                  avx_Nm * ikkappa1);
     277              :                 __m256d kkappa2 = _mm256_load_pd(avx_data.data() + i + align_offset +
     278              :                                                  avx_Nm * ikkappa2);
     279              : 
     280              :                 //  - the phase beta.
     281              :                 __m256d beta =
     282              :                     _mm256_load_pd(avx_data.data() + i + align_offset + avx_Nm * ibeta);
     283              : 
     284              :                 // Then, do the computation.
     285              : 
     286              :                 // This is the scalar product between k*kappa and pos:
     287              :                 __m256d z = _mm256_add_pd(_mm256_mul_pd(pos0, kkappa0),
     288              :                                           _mm256_add_pd(_mm256_mul_pd(pos1, kkappa1),
     289              :                                                         _mm256_mul_pd(pos2, kkappa2)));
     290              : 
     291              :                 // Here, the phase is added on. This is the argument of the cosine.
     292              :                 __m256d cos_arg = _mm256_add_pd(z, beta);
     293              : 
     294              :                 // ********
     295              :                 // * Computing the cosine
     296              :                 // * Part 1: Argument reduction
     297              :                 //
     298              :                 //  To understand the computation of the cosine, first note that the
     299              :                 //  cosine is periodic and we thus only need to model its behavior
     300              :                 //  between 0 and 2*pi to be able compute the function anywhere. In
     301              :                 //  fact, by mirroring the function along the x and y axes, even the
     302              :                 //  range between 0 and pi/2 is sufficient for this purpose. In this
     303              :                 //  range, the cosine can be efficiently evaluated with high precision
     304              :                 //  by using a polynomial approximation. Thus, to compute the cosine,
     305              :                 //  the input value is first reduced so that it lies within this range.
     306              :                 //  Then, the polynomial approximation is evaluated. Finally, if
     307              :                 //  necessary, the sign of the result is flipped (mirroring the function
     308              :                 //  along the x axis).
     309              :                 //
     310              :                 //  The actual computation is slightly more involved. First, argument
     311              :                 //  reduction can be simplified drastically by computing cos(pi*x),
     312              :                 //  such that the values are reduced to the range [0, 0.5) instead of
     313              :                 //  [0, pi/2). Since the cosine is even (independent of the sign), we
     314              :                 //  can first reduce values to [-0.5, 0.5) – that is, a simple rounding
     315              :                 //  operation – and then neutralize the sign. In fact, precisely because
     316              :                 //  the cosine is even, all terms of the polynomial are powers of x^2,
     317              :                 //  so the value of x^2 (computed as x*x) forms the basis for the
     318              :                 //  polynomial approximation. If I understand things correctly, then (in
     319              :                 //  IEEE-754 floating point) x*x and (-x)*(-x) will always result in the
     320              :                 //  exact same value, which means that any error bound over [0, 0.5)
     321              :                 //  automatically applies to (-0.5, 0] as well.
     322              : 
     323              :                 // First, compute round(x), and store it in q. If this value is odd,
     324              :                 // we're looking at the negative half-wave of the cosine, and thus
     325              :                 // will have to invert the sign of the result.
     326              :                 __m256d q = _mm256_round_pd(
     327              :                     cos_arg, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
     328              : 
     329              :                 // Since we're computing cos(pi*x), round(x) always yields the center of
     330              :                 // a half-wave (where cos(pi*x) achieves an extremum). This point
     331              :                 // logically corresponds to x=0. Therefore, we subtract this center from
     332              :                 // the actual input argument to find the corresponding point on the
     333              :                 // half-wave that is centered around zero.
     334              :                 __m256d s = _mm256_sub_pd(cos_arg, q);
     335              : 
     336              :                 // We now want to check whether q (the index of our half-wave) is even
     337              :                 // or odd, since all of the odd-numbered half-waves are negative, so
     338              :                 // we'll have to flip the final result. On an int, this is as simple as
     339              :                 // checking the 0th bit. Idea: manipulate the double in such a way that
     340              :                 // we can do this. So, we add 2^52, such that the last digit of the
     341              :                 // mantissa is actually in the ones' position. Since q may be negative,
     342              :                 // we'll also add 2^51 to make sure it's positive. Note that 2^51 is
     343              :                 // even and thus leaves evenness invariant, which is the only thing we
     344              :                 // care about here.
     345              :                 //
     346              :                 // This is based on the int extraction process described here:
     347              :                 // https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx/41223013
     348              :                 //
     349              :                 // We assume -2^51 <= q < 2^51 for this, which is unproblematic, as
     350              :                 // double precision has decayed far enough at that point that the
     351              :                 // usefulness of the cosine becomes limited.
     352              :                 //
     353              :                 // Explanation: The mantissa of a double-precision float has 52 bits
     354              :                 // (excluding the implicit first bit, which is always one). If |q| >
     355              :                 // 2^51, this implicit first bit has a place value of at least 2^51,
     356              :                 // while the first stored bit of the mantissa has a place value of at
     357              :                 // least 2^50. This means that the LSB of the mantissa has a place value
     358              :                 // of at least 2^(-1), or 0.5. For a cos(pi*x), this corresponds to a
     359              :                 // quarter of a cycle (pi/2), so at this point the precision of the
     360              :                 // input argument is so low that going from one representable number to
     361              :                 // the next causes the result to jump by +/-1.
     362              : 
     363              :                 q = _mm256_add_pd(q, _mm256_set1_pd(0x0018000000000000));
     364              : 
     365              :                 // Unfortunately, integer comparisons were only introduced in AVX2, so
     366              :                 // we'll have to make do with a floating point comparison to check
     367              :                 // whether the last bit is set. However, masking out all but the last
     368              :                 // bit will result in a denormal float, which may either result in
     369              :                 // performance problems or just be rounded down to zero, neither of
     370              :                 // which is what we want here. To fix this, we'll mask in not only bit
     371              :                 // 0, but also the exponent (and sign, but that doesn't matter) of q.
     372              :                 // Luckily, the exponent of q is guaranteed to have the fixed value of
     373              :                 // 1075 (corresponding to 2^52) after our addition.
     374              : 
     375              :                 __m256d invert = _mm256_and_pd(
     376              :                     q, _mm256_castsi256_pd(_mm256_set1_epi64x(0xfff0000000000001)));
     377              : 
     378              :                 // If we did have a one in bit 0, our result will be equal to 2^52 + 1.
     379              :                 invert = _mm256_cmp_pd(
     380              :                     invert, _mm256_castsi256_pd(_mm256_set1_epi64x(0x4330000000000001)),
     381              :                     _CMP_EQ_OQ);
     382              : 
     383              :                 // Now we know whether to flip the sign of the result. However, remember
     384              :                 // that we're working on multiple values at a time, so an if statement
     385              :                 // won't be of much use here (plus it might perform badly). Instead,
     386              :                 // we'll make use of the fact that the result of the comparison is all
     387              :                 // ones if the comparison was true (i.e. q is odd and we need to flip
     388              :                 // the result), and all zeroes otherwise. If we now mask out all bits
     389              :                 // except the sign bit, we get something that, when xor'ed into our
     390              :                 // final result, will flip the sign exactly when q is odd.
     391              :                 invert = _mm256_and_pd(invert, _mm256_set1_pd(-0.0));
     392              :                 // (Note that the binary representation of -0.0 is all 0 bits, except
     393              :                 // for the sign bit, which is set to 1.)
     394              : 
     395              :                 // TODO: clamp floats between 0 and 1? This would ensure that we never
     396              :                 // see inf's, but maybe we want that, so that things dont just fail
     397              :                 // silently...
     398              : 
     399              :                 // * end of argument reduction
     400              :                 // *******
     401              : 
     402              :                 // ******
     403              :                 // * Evaluate the cosine using a polynomial approximation for the zeroth
     404              :                 // half-wave.
     405              :                 // * The coefficients for this were generated using sleefs gencoef.c.
     406              :                 // * These coefficients are probably far from optimal; however, they
     407              :                 // should be sufficient for this case.
     408              :                 s = _mm256_mul_pd(s, s);
     409              : 
     410              :                 __m256d u = _mm256_set1_pd(+0.2211852080653743946e+0);
     411              : 
     412              :                 u = _mm256_add_pd(_mm256_mul_pd(u, s),
     413              :                                   _mm256_set1_pd(-0.1332560668688523853e+1));
     414              :                 u = _mm256_add_pd(_mm256_mul_pd(u, s),
     415              :                                   _mm256_set1_pd(+0.4058509506474178075e+1));
     416              :                 u = _mm256_add_pd(_mm256_mul_pd(u, s),
     417              :                                   _mm256_set1_pd(-0.4934797516664651162e+1));
     418              :                 u = _mm256_add_pd(_mm256_mul_pd(u, s), _mm256_set1_pd(1.));
     419              : 
     420              :                 // Then, flip the sign of each double for which invert is not zero.
     421              :                 // Since invert has only zero bits except for a possible one in bit 63,
     422              :                 // we can xor it onto our result to selectively invert the 63rd (sign)
     423              :                 // bit in each double where invert is set.
     424              :                 u = _mm256_xor_pd(u, invert);
     425              : 
     426              :                 // * end computation of cosine
     427              :                 // **********
     428              : 
     429              :                 // Finally, Ak*xi is multiplied on. Since this is a vector, the
     430              :                 // multiplication needs to be done for each of the three
     431              :                 // components, so it happens separately.
     432              :                 acc0 = _mm256_add_pd(_mm256_mul_pd(u, Axi0), acc0);
     433              :                 acc1 = _mm256_add_pd(_mm256_mul_pd(u, Axi1), acc1);
     434              :                 acc2 = _mm256_add_pd(_mm256_mul_pd(u, Axi2), acc2);
     435              :         }
     436              : 
     437              :         return Vector3d(hsum_double_avx(acc0), hsum_double_avx(acc1),
     438              :                         hsum_double_avx(acc2));
     439              : #endif // ENABLE_FAST_WAVES
     440              : }
     441              : 
     442              : } // namespace crpropa
        

Generated by: LCOV version 2.0-1