Tissue Forge C++ 0.2.1
Interactive, particle-based physics, chemistry and biology modeling and simulation environment
Loading...
Searching...
No Matches
tf_potential_eval.h
1/*******************************************************************************
2 * This file is part of mdcore.
3 * Coypright (c) 2010 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
4 * Coypright (c) 2017 Andy Somogyi (somogyie at indiana dot edu)
5 * Copyright (c) 2022-2024 T.J. Sego
6 *
7 * This program is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 *
20 ******************************************************************************/
21
22#ifndef _MDCORE_SOURCE_TF_POTENTIAL_EVAL_H_
23#define _MDCORE_SOURCE_TF_POTENTIAL_EVAL_H_
24
25#include <tfPotential.h>
26#include <tfEngine.h>
27#include "tf_dpd_eval.h"
28#include <tfDPDPotential.h>
29
30/* This file contains the potential evaluation function als "extern inline",
31 such that they can be inlined in the respective modules.
32
33 If your code wants to call any potential_eval functions, you must include
34 this file.
35*/
36
37#include <iostream>
38
39
40namespace TissueForge {
41
42
43 TF_ALWAYS_INLINE FPTYPE potential_eval_adjust_distance(struct Potential *p, FPTYPE ri, FPTYPE rj, FPTYPE r) {
44 if(p->flags & POTENTIAL_SCALED) {
45 r = r / (ri + rj);
46 }
47 else if(p->flags & POTENTIAL_SHIFTED) {
48 r = r - (ri + rj) + p->r0_plusone;
49 }
50 return r;
51 }
52
53 TF_ALWAYS_INLINE FPTYPE potential_eval_adjust_distance2(struct Potential *p, FPTYPE ri, FPTYPE rj, FPTYPE r2) {
54 FPTYPE r = potential_eval_adjust_distance(p, ri, rj, FPTYPE_SQRT(r2));
55 return r * r;
56 }
57
58
74 TF_ALWAYS_INLINE void potential_eval(struct Potential *p, FPTYPE r2, FPTYPE *e, FPTYPE *f) {
75
76 int ind, k;
77 FPTYPE x, ee, eff, *c, r;
78
79 /* Get r for the right type. */
80 r = FPTYPE_SQRT(r2);
81
82 /* compute the index */
83 ind = FPTYPE_FMAX(FPTYPE_ZERO, p->alpha[0] + r * (p->alpha[1] + r * p->alpha[2]));
84
85 /* get the table offset */
86 c = &(p->c[ind * potential_chunk]);
87
88 /* adjust x to the interval */
89 x = (r - c[0]) * c[1];
90
91 /* compute the potential and its derivative */
92 ee = c[2] * x + c[3];
93 eff = c[2];
94 for(k = 4 ; k < potential_chunk ; k++) {
95 eff = eff * x + ee;
96 ee = ee * x + c[k];
97 }
98
99 /* store the result */
100 *e = ee; *f = eff * c[1] / r;
101
102 }
103
104 TF_ALWAYS_INLINE bool potential_eval_ex(
105 struct Potential *p, FPTYPE ri, FPTYPE rj, FPTYPE r2, FPTYPE *e, FPTYPE *f) {
106
107 unsigned ind, k;
108 FPTYPE x, ee, eff, *c, r, ro;
109
110 static const FPTYPE epsilon = std::numeric_limits<FPTYPE>::epsilon();
111
112 /* Get r for the right type. */
113 r = FPTYPE_SQRT(r2);
114 ro = r < epsilon ? epsilon : r;
115
116 r = potential_eval_adjust_distance(p, ri, rj, r);
117
118 // cutoff min value, eval at lowest func interpolation.
119 r = r < p->a ? p->a : r;
120
121 /* compute the index */
122 ind = std::max(FPTYPE_ZERO, p->alpha[0] + r * (p->alpha[1] + r * p->alpha[2]));
123
124 if(r > p->b || ind > p->n) {
125 return false;
126 }
127
128 /* get the table offset */
129 c = &(p->c[ind * potential_chunk]);
130
131 /* adjust x to the interval */
132 x = (r - c[0]) * c[1];
133
134 /* compute the potential and its derivative */
135 ee = c[2] * x + c[3];
136 eff = c[2];
137 for(k = 4 ; k < potential_chunk ; k++) {
138 eff = eff * x + ee;
139 ee = ee * x + c[k];
140 }
141
142 /* store the result */
143 *e = ee; *f = eff * c[1] / ro;
144
145 return true;
146 }
147
148
149
165 TF_ALWAYS_INLINE void potential_eval_r (struct Potential *p, FPTYPE r, FPTYPE *e, FPTYPE *f) {
166
167 int ind, k;
168 FPTYPE x, ee, eff, *c;
169
170 /* compute the index */
171 ind = FPTYPE_FMAX(FPTYPE_ZERO, p->alpha[0] + r * (p->alpha[1] + r * p->alpha[2]));
172
173 if(ind > p->n) {
174 return;
175 }
176
177 /* get the table offset */
178 c = &(p->c[ind * potential_chunk]);
179
180 /* adjust x to the interval */
181 x = (r - c[0]) * c[1];
182
183 /* compute the potential and its derivative */
184 ee = c[2] * x + c[3];
185 eff = c[2];
186 for(k = 4 ; k < potential_chunk ; k++) {
187 eff = eff * x + ee;
188 ee = ee * x + c[k];
189 }
190
191 /* store the result */
192 *e = ee; *f = eff * c[1];
193
194 }
195
196 TF_ALWAYS_INLINE bool potential_eval_super_ex(const space_cell *cell,
197 Potential *pot, Particle *part_i, Particle *part_j,
198 FPTYPE *dx, FPTYPE r2, FPTYPE *epot);
199
200 static bool _potential_eval_super_ex(const space_cell *cell,
201 Potential *pot, Particle *part_i, Particle *part_j,
202 FPTYPE *dx, FPTYPE r2, FPTYPE *epot)
203 {
204 return potential_eval_super_ex(cell, pot, part_i, part_j, dx, r2, epot);
205 }
206
207 bool potential_eval_super_ex(const space_cell *cell,
208 Potential *pot, Particle *part_i, Particle *part_j,
209 FPTYPE *dx, FPTYPE r2, FPTYPE *epot) {
210
211 if(pot->kind == POTENTIAL_KIND_COMBINATION) {
212 if(pot->flags & POTENTIAL_SUM) {
213 return _potential_eval_super_ex(cell, pot->pca, part_i, part_j, dx, r2, epot) || _potential_eval_super_ex(cell, pot->pcb, part_i, part_j, dx, r2, epot);
214 }
215 }
216
217 FPTYPE e;
218 bool result = false;
219 FPTYPE _dx[3], _r2;
220
221 if(pot->flags & POTENTIAL_PERIODIC) {
222 // Assuming elsewhere there's a corresponding potential in the opposite direction
223 _r2 = fptype_r2(dx, pot->offset, _dx);
224 }
225 else {
226 _r2 = r2;
227 for (int k = 0; k < 3; k++) _dx[k] = dx[k];
228 }
229
230 if(pot->kind == POTENTIAL_KIND_DPD) {
231 /* update the forces if part in range */
232 if (dpd_eval((DPDPotential*)pot, space_cell_gaussian(cell->id), part_i, part_j, _dx, _r2, &e)) {
233
234 /* tabulate the energy */
235 *epot += e;
236 result = true;
237 }
238 }
239 else if(pot->kind == POTENTIAL_KIND_BYPARTICLES) {
240 FPTYPE fv[3] = {0., 0., 0.};
241
242 pot->eval_byparts(pot, part_i, part_j, _dx, _r2, &e, fv);
243
244 for (int k = 0 ; k < 3 ; k++) {
245 part_i->f[k] += fv[k];
246 part_j->f[k] -= fv[k];
247 }
248
249 /* tabulate the energy */
250 *epot += e;
251 result = true;
252 }
253 else if(pot->kind != POTENTIAL_KIND_COMBINATION) {
254 FPTYPE f;
255
256 /* update the forces if part in range */
257 if (potential_eval_ex(pot, part_i->radius, part_j->radius, _r2, &e, &f)) {
258
259 for (int k = 0 ; k < 3 ; k++) {
260 FPTYPE w = f * _dx[k];
261 part_i->f[k] -= w;
262 part_j->f[k] += w;
263 }
264
265 /* tabulate the energy */
266 *epot += e;
267 result = true;
268 }
269 }
270
271 return result;
272 }
273
274
296 TF_ALWAYS_INLINE void potential_eval_expl(struct Potential *p, FPTYPE r2, FPTYPE *e, FPTYPE *f) {
297
298 const FPTYPE isqrtpi = 0.56418958354775628695;
299 const FPTYPE kappa = 3.0;
300 FPTYPE r = sqrt(r2), ir = 1.0 / r, ir2 = ir * ir, ir4, ir6, ir12, t1, t2;
301 FPTYPE ee = 0.0, eff = 0.0;
302
303 /* Do we have a Lennard-Jones interaction? */
304 if(p->flags & POTENTIAL_LJ126) {
305
306 /* init some variables */
307 ir4 = ir2 * ir2; ir6 = ir4 * ir2; ir12 = ir6 * ir6;
308
309 /* compute the energy and the force */
310 ee = (p->alpha[0] * ir12 - p->alpha[1] * ir6);
311 eff = 6.0 * ir * (-2.0 * p->alpha[0] * ir12 + p->alpha[1] * ir6);
312
313 }
314
315 /* Do we have an Ewald short-range part? */
316 if(p->flags & POTENTIAL_EWALD) {
317
318 /* get some values we will re-use */
319 t2 = r * kappa;
320 t1 = erfc(t2);
321
322 /* compute the energy and the force */
323 ee += p->alpha[2] * t1 * ir;
324 eff += p->alpha[2] * (-2.0 * isqrtpi * exp(-t2 * t2) * kappa * ir - t1 * ir2);
325
326 }
327
328 /* Do we have a Coulomb interaction? */
329 if(p->flags & POTENTIAL_COULOMB) {
330
331 /* compute the energy and the force */
332 ee += potential_escale * p->alpha[2] * ir;
333 eff += -potential_escale * p->alpha[2] * ir2;
334
335 }
336
337 /* store the potential and force. */
338 *e = ee;
339 *f = eff;
340
341 }
342
343
367 TF_ALWAYS_INLINE void potential_eval_vec_4single(struct Potential *p[4], float *r2, float *e, float *f) {
368
369 #if defined(__SSE__) && defined(FPTYPE_SINGLE)
370 // int j, k;
371 union {
372 __v4sf v;
373 __m128i m;
374 float f[4];
375 int i[4];
376 } alpha[4], mi, hi, x, ee, eff, c[6], r, ind, t[8];
377 // float *data[4];
378
379 /* Get r . */
380 r.v = _mm_sqrt_ps(_mm_load_ps(r2));
381
382 /* compute the index */
383 alpha[0].v = _mm_load_ps(p[0]->alpha);
384 alpha[1].v = _mm_load_ps(p[1]->alpha);
385 alpha[2].v = _mm_load_ps(p[2]->alpha);
386 alpha[3].v = _mm_load_ps(p[3]->alpha);
387 t[0].m = _mm_unpacklo_epi32(alpha[0].m, alpha[1].m);
388 t[1].m = _mm_unpacklo_epi32(alpha[2].m, alpha[3].m);
389 t[2].m = _mm_unpackhi_epi32(alpha[0].m, alpha[1].m);
390 t[3].m = _mm_unpackhi_epi32(alpha[2].m, alpha[3].m);
391 alpha[0].m = _mm_unpacklo_epi64(t[0].m, t[1].m);
392 alpha[1].m = _mm_unpackhi_epi64(t[0].m, t[1].m);
393 alpha[2].m = _mm_unpacklo_epi64(t[2].m, t[3].m);
394 ind.m = _mm_cvttps_epi32(_mm_max_ps(_mm_setzero_ps(), _mm_add_ps(alpha[0].v, _mm_mul_ps(r.v, _mm_add_ps(alpha[1].v, _mm_mul_ps(r.v, alpha[2].v))))));
395
396 /* Unpack/transpose the coefficient data. */
397 mi.v = _mm_load_ps(&p[0]->c[ ind.i[0] * potential_chunk ]);
398 hi.v = _mm_load_ps(&p[1]->c[ ind.i[1] * potential_chunk ]);
399 c[0].v = _mm_load_ps(&p[2]->c[ ind.i[2] * potential_chunk ]);
400 c[1].v = _mm_load_ps(&p[3]->c[ ind.i[3] * potential_chunk ]);
401 _MM_TRANSPOSE4_PS(mi.v, hi.v, c[0].v, c[1].v);
402 c[2].v = _mm_load_ps(&p[0]->c[ ind.i[0] * potential_chunk + 4 ]);
403 c[3].v = _mm_load_ps(&p[1]->c[ ind.i[1] * potential_chunk + 4 ]);
404 c[4].v = _mm_load_ps(&p[2]->c[ ind.i[2] * potential_chunk + 4 ]);
405 c[5].v = _mm_load_ps(&p[3]->c[ ind.i[3] * potential_chunk + 4 ]);
406 _MM_TRANSPOSE4_PS(c[2].v, c[3].v, c[4].v, c[5].v);
407
408 /* adjust x to the interval */
409 x.v = _mm_mul_ps(_mm_sub_ps(r.v, mi.v), hi.v);
410
411 /* compute the potential and its derivative */
412 eff.v = c[0].v;
413 ee.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), c[1].v);
414 eff.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), ee.v);
415 ee.v = _mm_add_ps(_mm_mul_ps(ee.v, x.v), c[2].v);
416 eff.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), ee.v);
417 ee.v = _mm_add_ps(_mm_mul_ps(ee.v, x.v), c[3].v);
418 eff.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), ee.v);
419 ee.v = _mm_add_ps(_mm_mul_ps(ee.v, x.v), c[4].v);
420 eff.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), ee.v);
421 ee.v = _mm_add_ps(_mm_mul_ps(ee.v, x.v), c[5].v);
422
423 /* store the result */
424 _mm_store_ps(e, ee.v);
425 _mm_store_ps(f, _mm_mul_ps(eff.v, _mm_div_ps(hi.v, r.v)));
426
427 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE)
428 int j, k;
429 union {
430 vector float v;
431 float f[4];
432 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r;
433 union {
434 vector unsigned int v;
435 unsigned int i[4];
436 } ind;
437 float *data[4];
438
439 /* Get r . */
440 r.v = vec_sqrt(*((vector float *)r2));
441
442 /* compute the index (vec_ctu maps negative floats to 0) */
443 alpha0.v = vec_load4(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
444 alpha1.v = vec_load4(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
445 alpha2.v = vec_load4(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
446 ind.v = vec_ctu(vec_madd(r.v, vec_madd(r.v, alpha2.v, alpha1.v), alpha0.v), 0);
447
448 /* get the table offset */
449 for(k = 0 ; k < 4 ; k++)
450 data[k] = &(p[k]->c[ ind.i[k] * potential_chunk ]);
451
452 /* adjust x to the interval */
453 mi.v = vec_load4(data[0][0], data[1][0], data[2][0], data[3][0]);
454 hi.v = vec_load4(data[0][1], data[1][1], data[2][1], data[3][1]);
455 x.v = vec_mul(vec_sub(r.v, mi.v), hi.v);
456
457 /* compute the potential and its derivative */
458 eff.v = vec_load4(data[0][2], data[1][2], data[2][2], data[3][2]);
459 c.v = vec_load4(data[0][3], data[1][3], data[2][3], data[3][3]);
460 ee.v = vec_madd(eff.v, x.v, c.v);
461 for(j = 4 ; j < potential_chunk ; j++) {
462 c.v = vec_load4(data[0][j], data[1][j], data[2][j], data[3][j]);
463 eff.v = vec_madd(eff.v, x.v, ee.v);
464 ee.v = vec_madd(ee.v, x.v, c.v);
465 }
466
467 /* store the result */
468 *((vector float *)e) = ee.v;
469 *((vector float *)f) = vec_mul(eff.v, vec_div(hi.v, r.v));
470
471 #else
472 int k;
473 FPTYPE ee, eff;
474 for(k = 0 ; k < 4 ; k++) {
475 potential_eval_r(p[k], r2[k], &ee, &eff);
476 e[k] = ee; f[k] = eff;
477 }
478 #endif
479
480 }
481
482
506 TF_ALWAYS_INLINE void potential_eval_vec_4single_old(struct Potential *p[4], float *r2, float *e, float *f) {
507
508 #if defined(__SSE__) && defined(FPTYPE_SINGLE)
509 int j, k;
510 union {
511 __v4sf v;
512 __m128i m;
513 float f[4];
514 int i[4];
515 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r, ind;
516 float *data[4];
517
518 /* Get r . */
519 r.v = _mm_sqrt_ps(_mm_load_ps(r2));
520
521 /* compute the index */
522 alpha0.v = _mm_setr_ps(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
523 alpha1.v = _mm_setr_ps(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
524 alpha2.v = _mm_setr_ps(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
525 ind.m = _mm_cvttps_epi32(_mm_max_ps(_mm_setzero_ps(), _mm_add_ps(alpha0.v, _mm_mul_ps(r.v, _mm_add_ps(alpha1.v, _mm_mul_ps(r.v, alpha2.v))))));
526
527 /* get the table offset */
528 for(k = 0 ; k < 4 ; k++)
529 data[k] = &(p[k]->c[ ind.i[k] * potential_chunk ]);
530
531 /* adjust x to the interval */
532 mi.v = _mm_setr_ps(data[0][0], data[1][0], data[2][0], data[3][0]);
533 hi.v = _mm_setr_ps(data[0][1], data[1][1], data[2][1], data[3][1]);
534 x.v = _mm_mul_ps(_mm_sub_ps(r.v, mi.v), hi.v);
535
536 /* compute the potential and its derivative */
537 eff.v = _mm_setr_ps(data[0][2], data[1][2], data[2][2], data[3][2]);
538 c.v = _mm_setr_ps(data[0][3], data[1][3], data[2][3], data[3][3]);
539 ee.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), c.v);
540 for(j = 4 ; j < potential_chunk ; j++) {
541 c.v = _mm_setr_ps(data[0][j], data[1][j], data[2][j], data[3][j]);
542 eff.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), ee.v);
543 ee.v = _mm_add_ps(_mm_mul_ps(ee.v, x.v), c.v);
544 }
545
546 /* store the result */
547 _mm_store_ps(e, ee.v);
548 _mm_store_ps(f, _mm_mul_ps(eff.v, _mm_div_ps(hi.v, r.v)));
549
550 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE)
551 int j, k;
552 union {
553 vector float v;
554 float f[4];
555 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r;
556 union {
557 vector unsigned int v;
558 unsigned int i[4];
559 } ind;
560 float *data[4];
561
562 /* Get r . */
563 r.v = vec_sqrt(*((vector float *)r2));
564
565 /* compute the index (vec_ctu maps negative floats to 0) */
566 alpha0.v = vec_load4(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
567 alpha1.v = vec_load4(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
568 alpha2.v = vec_load4(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
569 ind.v = vec_ctu(vec_madd(r.v, vec_madd(r.v, alpha2.v, alpha1.v), alpha0.v), 0);
570
571 /* get the table offset */
572 for(k = 0 ; k < 4 ; k++)
573 data[k] = &(p[k]->c[ ind.i[k] * potential_chunk ]);
574
575 /* adjust x to the interval */
576 mi.v = vec_load4(data[0][0], data[1][0], data[2][0], data[3][0]);
577 hi.v = vec_load4(data[0][1], data[1][1], data[2][1], data[3][1]);
578 x.v = vec_mul(vec_sub(r.v, mi.v), hi.v);
579
580 /* compute the potential and its derivative */
581 eff.v = vec_load4(data[0][2], data[1][2], data[2][2], data[3][2]);
582 c.v = vec_load4(data[0][3], data[1][3], data[2][3], data[3][3]);
583 ee.v = vec_madd(eff.v, x.v, c.v);
584 for(j = 4 ; j < potential_chunk ; j++) {
585 c.v = vec_load4(data[0][j], data[1][j], data[2][j], data[3][j]);
586 eff.v = vec_madd(eff.v, x.v, ee.v);
587 ee.v = vec_madd(ee.v, x.v, c.v);
588 }
589
590 /* store the result */
591 *((vector float *)e) = ee.v;
592 *((vector float *)f) = vec_mul(eff.v, vec_div(hi.v, r.v));
593
594 #else
595 int k;
596 FPTYPE ee, eff;
597 for(k = 0 ; k < 4 ; k++) {
598 potential_eval_r(p[k], r2[k], &ee, &eff);
599 e[k] = ee; f[k] = eff;
600 }
601 #endif
602
603 }
604
605
606 #ifdef STILL_NOT_READY_FOR_PRIME_TIME
607 TF_ALWAYS_INLINE void potential_eval_vec_4single_gccvec(struct Potential *p[4], float *r2, float *e, float *f) {
608
609 int j, k;
610 union {
611 vector(4,float) v;
612 float f[4];
613 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r;
614 union {
615 vector(4,int) v;
616 int i[4];
617 } ind;
618 FPTYPE *data[4];
619
620 /* Get r . */
621 r.v = sqrtf(*((vector(4,float) *)r2));
622
623 /* compute the index */
624 for(k = 0 ; k < 4 ; k++) {
625 alpha0.f[k] = p[k]->alpha[0];
626 alpha1.f[k] = p[k]->alpha[1];
627 alpha2.f[k] = p[k]->alpha[2];
628 }
629 ind.v = max((vector(4,int)){0,0,0,0}, (vector(4,int))(alpha0.v + r.v*(alpha1.v + r.v*alpha2.v)));
630
631 /* get the table offset */
632 for(k = 0 ; k < 4 ; k++)
633 data[k] = &(p[k]->c[ ind.i[k] * potential_chunk ]);
634
635 /* adjust x to the interval */
636 for(k = 0 ; k < 4 ; k++) {
637 mi.f[k] = data[k][0];
638 hi.f[k] = data[k][1];
639 }
640 x.v = (r.v - mi.v) * hi.v;
641
642 /* compute the potential and its derivative */
643 for(k = 0 ; k < 4 ; k++) {
644 eff.f[k] = data[k][2];
645 c.f[k] = data[k][3];
646 }
647 ee.v = eff.v*x.v + c.v;
648 for(j = 4 ; j < potential_chunk ; j++) {
649 for(k = 0 ; k < 4 ; k++)
650 c.f[k] = data[k][j];
651 eff.v = eff.v*x.v + ee.v;
652 ee.v = ee.v*x.v + c.v;
653 }
654
655 /* store the result */
656 *((vector(4,float) *)e) = ee.v;
657 *((vector(4,float) *)f) = eff.v*(hi.v / r.v);
658
659 }
660 #endif
661
662
686 TF_ALWAYS_INLINE void potential_eval_vec_4single_r(struct Potential *p[4], float *r_in, float *e, float *f) {
687
688 #if defined(__SSE__) && defined(FPTYPE_SINGLE)
689 int j, k;
690 union {
691 __v4sf v;
692 __m128i m;
693 float f[4];
694 int i[4];
695 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r, ind;
696 float *data[4];
697
698 /* Get r . */
699 r.v = _mm_load_ps(r_in);
700
701 /* compute the index */
702 alpha0.v = _mm_setr_ps(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
703 alpha1.v = _mm_setr_ps(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
704 alpha2.v = _mm_setr_ps(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
705 ind.m = _mm_cvttps_epi32(_mm_max_ps(_mm_setzero_ps(), _mm_add_ps(alpha0.v, _mm_mul_ps(r.v, _mm_add_ps(alpha1.v, _mm_mul_ps(r.v, alpha2.v))))));
706
707 /* get the table offset */
708 for(k = 0 ; k < 4 ; k++)
709 data[k] = &(p[k]->c[ ind.i[k] * potential_chunk ]);
710
711 /* adjust x to the interval */
712 mi.v = _mm_setr_ps(data[0][0], data[1][0], data[2][0], data[3][0]);
713 hi.v = _mm_setr_ps(data[0][1], data[1][1], data[2][1], data[3][1]);
714 x.v = _mm_mul_ps(_mm_sub_ps(r.v, mi.v), hi.v);
715
716 /* compute the potential and its derivative */
717 eff.v = _mm_setr_ps(data[0][2], data[1][2], data[2][2], data[3][2]);
718 c.v = _mm_setr_ps(data[0][3], data[1][3], data[2][3], data[3][3]);
719 ee.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), c.v);
720 for(j = 4 ; j < potential_chunk ; j++) {
721 c.v = _mm_setr_ps(data[0][j], data[1][j], data[2][j], data[3][j]);
722 eff.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), ee.v);
723 ee.v = _mm_add_ps(_mm_mul_ps(ee.v, x.v), c.v);
724 }
725
726 /* store the result */
727 _mm_store_ps(e, ee.v);
728 _mm_store_ps(f, _mm_mul_ps(eff.v, hi.v));
729
730 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE)
731 int j, k;
732 union {
733 vector float v;
734 float f[4];
735 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r;
736 union {
737 vector unsigned int v;
738 unsigned int i[4];
739 } ind;
740 float *data[4];
741
742 /* Get r . */
743 r.v = *((vector float *)r_in);
744
745 /* compute the index (vec_ctu maps negative floats to 0) */
746 alpha0.v = vec_load4(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
747 alpha1.v = vec_load4(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
748 alpha2.v = vec_load4(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
749 ind.v = vec_ctu(vec_madd(r.v, vec_madd(r.v, alpha2.v, alpha1.v), alpha0.v), 0);
750
751 /* get the table offset */
752 for(k = 0 ; k < 4 ; k++)
753 data[k] = &(p[k]->c[ ind.i[k] * potential_chunk ]);
754
755 /* adjust x to the interval */
756 mi.v = vec_load4(data[0][0], data[1][0], data[2][0], data[3][0]);
757 hi.v = vec_load4(data[0][1], data[1][1], data[2][1], data[3][1]);
758 x.v = vec_mul(vec_sub(r.v, mi.v), hi.v);
759
760 /* compute the potential and its derivative */
761 eff.v = vec_load4(data[0][2], data[1][2], data[2][2], data[3][2]);
762 c.v = vec_load4(data[0][3], data[1][3], data[2][3], data[3][3]);
763 ee.v = vec_madd(eff.v, x.v, c.v);
764 for(j = 4 ; j < potential_chunk ; j++) {
765 c.v = vec_load4(data[0][j], data[1][j], data[2][j], data[3][j]);
766 eff.v = vec_madd(eff.v, x.v, ee.v);
767 ee.v = vec_madd(ee.v, x.v, c.v);
768 }
769
770 /* store the result */
771 *((vector float *)e) = ee.v;
772 *((vector float *)f) = vec_mul(eff.v, hi.v);
773
774 #else
775 int k;
776 FPTYPE ee, eff;
777 for(k = 0 ; k < 4 ; k++) {
778 potential_eval(p[k], r_in[k], &ee, &eff);
779 e[k] = ee; f[k] = eff;
780 }
781 #endif
782
783 }
784
785
809 TF_ALWAYS_INLINE void potential_eval_vec_8single(struct Potential *p[8], float *r2, float *e, float *f) {
810
811 #if defined(__AVX__) && defined(FPTYPE_SINGLE)
812 int j;
813 union {
814 __v8sf v;
815 __m256i m;
816 float f[8];
817 int i[8];
818 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r, ind;
819 float *data[8];
820
821 /* Get r . */
822 r.v = _mm256_sqrt_ps(_mm256_load_ps(r2));
823
824 /* compute the index */
825 alpha0.v = _mm256_setr_ps(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0], p[4]->alpha[0], p[5]->alpha[0], p[6]->alpha[0], p[7]->alpha[0]);
826 alpha1.v = _mm256_setr_ps(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1], p[4]->alpha[1], p[5]->alpha[1], p[6]->alpha[1], p[7]->alpha[1]);
827 alpha2.v = _mm256_setr_ps(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2], p[4]->alpha[2], p[5]->alpha[2], p[6]->alpha[2], p[7]->alpha[2]);
828 ind.m = _mm256_cvttps_epi32(_mm256_max_ps(_mm256_setzero_ps(), _mm256_add_ps(alpha0.v, _mm256_mul_ps(r.v, _mm256_add_ps(alpha1.v, _mm256_mul_ps(r.v, alpha2.v))))));
829
830 /* get the table offset */
831 data[0] = &(p[0]->c[ ind.i[0] * potential_chunk ]);
832 data[1] = &(p[1]->c[ ind.i[1] * potential_chunk ]);
833 data[2] = &(p[2]->c[ ind.i[2] * potential_chunk ]);
834 data[3] = &(p[3]->c[ ind.i[3] * potential_chunk ]);
835 data[4] = &(p[4]->c[ ind.i[4] * potential_chunk ]);
836 data[5] = &(p[5]->c[ ind.i[5] * potential_chunk ]);
837 data[6] = &(p[6]->c[ ind.i[6] * potential_chunk ]);
838 data[7] = &(p[7]->c[ ind.i[7] * potential_chunk ]);
839
840 /* adjust x to the interval */
841 mi.v = _mm256_setr_ps(data[0][0], data[1][0], data[2][0], data[3][0], data[4][0], data[5][0], data[6][0], data[7][0]);
842 hi.v = _mm256_setr_ps(data[0][1], data[1][1], data[2][1], data[3][1], data[4][1], data[5][1], data[6][1], data[7][1]);
843 x.v = _mm256_mul_ps(_mm256_sub_ps(r.v, mi.v), hi.v);
844
845 /* compute the potential and its derivative */
846 eff.v = _mm256_setr_ps(data[0][2], data[1][2], data[2][2], data[3][2], data[4][2], data[5][2], data[6][2], data[7][2]);
847 c.v = _mm256_setr_ps(data[0][3], data[1][3], data[2][3], data[3][3], data[4][3], data[5][3], data[6][3], data[7][3]);
848 ee.v = _mm256_add_ps(_mm256_mul_ps(eff.v, x.v), c.v);
849 for(j = 4 ; j < potential_chunk ; j++) {
850 c.v = _mm256_setr_ps(data[0][j], data[1][j], data[2][j], data[3][j], data[4][j], data[5][j], data[6][j], data[7][j]);
851 eff.v = _mm256_add_ps(_mm256_mul_ps(eff.v, x.v), ee.v);
852 ee.v = _mm256_add_ps(_mm256_mul_ps(ee.v, x.v), c.v);
853 }
854
855 /* store the result */
856 _mm256_store_ps(e, ee.v);
857 _mm256_store_ps(f, _mm256_mul_ps(eff.v, _mm256_div_ps(hi.v, r.v)));
858
859 #elif defined(__SSE__) && defined(FPTYPE_SINGLE)
860 int j;
861 union {
862 __v4sf v;
863 __m128i m;
864 float f[4];
865 int i[4];
866 } alpha0_1, alpha1_1, alpha2_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, ind_1,
867 alpha0_2, alpha1_2, alpha2_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2, ind_2;
868 float *data[8];
869
870 /* Get r . */
871 r_1.v = _mm_sqrt_ps(_mm_load_ps(&r2[0]));
872 r_2.v = _mm_sqrt_ps(_mm_load_ps(&r2[4]));
873
874 /* compute the index */
875 alpha0_1.v = _mm_setr_ps(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
876 alpha1_1.v = _mm_setr_ps(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
877 alpha2_1.v = _mm_setr_ps(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
878 alpha0_2.v = _mm_setr_ps(p[4]->alpha[0], p[5]->alpha[0], p[6]->alpha[0], p[7]->alpha[0]);
879 alpha1_2.v = _mm_setr_ps(p[4]->alpha[1], p[5]->alpha[1], p[6]->alpha[1], p[7]->alpha[1]);
880 alpha2_2.v = _mm_setr_ps(p[4]->alpha[2], p[5]->alpha[2], p[6]->alpha[2], p[7]->alpha[2]);
881 ind_1.m = _mm_cvttps_epi32(_mm_max_ps(_mm_setzero_ps(), _mm_add_ps(alpha0_1.v, _mm_mul_ps(r_1.v, _mm_add_ps(alpha1_1.v, _mm_mul_ps(r_1.v, alpha2_1.v))))));
882 ind_2.m = _mm_cvttps_epi32(_mm_max_ps(_mm_setzero_ps(), _mm_add_ps(alpha0_2.v, _mm_mul_ps(r_2.v, _mm_add_ps(alpha1_2.v, _mm_mul_ps(r_2.v, alpha2_2.v))))));
883
884 /* get the table offset */
885 data[0] = &(p[0]->c[ ind_1.i[0] * potential_chunk ]);
886 data[1] = &(p[1]->c[ ind_1.i[1] * potential_chunk ]);
887 data[2] = &(p[2]->c[ ind_1.i[2] * potential_chunk ]);
888 data[3] = &(p[3]->c[ ind_1.i[3] * potential_chunk ]);
889 data[4] = &(p[4]->c[ ind_2.i[0] * potential_chunk ]);
890 data[5] = &(p[5]->c[ ind_2.i[1] * potential_chunk ]);
891 data[6] = &(p[6]->c[ ind_2.i[2] * potential_chunk ]);
892 data[7] = &(p[7]->c[ ind_2.i[3] * potential_chunk ]);
893
894 /* adjust x to the interval */
895 mi_1.v = _mm_setr_ps(data[0][0], data[1][0], data[2][0], data[3][0]);
896 hi_1.v = _mm_setr_ps(data[0][1], data[1][1], data[2][1], data[3][1]);
897 mi_2.v = _mm_setr_ps(data[4][0], data[5][0], data[6][0], data[7][0]);
898 hi_2.v = _mm_setr_ps(data[4][1], data[5][1], data[6][1], data[7][1]);
899 x_1.v = _mm_mul_ps(_mm_sub_ps(r_1.v, mi_1.v), hi_1.v);
900 x_2.v = _mm_mul_ps(_mm_sub_ps(r_2.v, mi_2.v), hi_2.v);
901
902 /* compute the potential and its derivative */
903 eff_1.v = _mm_setr_ps(data[0][2], data[1][2], data[2][2], data[3][2]);
904 eff_2.v = _mm_setr_ps(data[4][2], data[5][2], data[6][2], data[7][2]);
905 c_1.v = _mm_setr_ps(data[0][3], data[1][3], data[2][3], data[3][3]);
906 c_2.v = _mm_setr_ps(data[4][3], data[5][3], data[6][3], data[7][3]);
907 ee_1.v = _mm_add_ps(_mm_mul_ps(eff_1.v, x_1.v), c_1.v);
908 ee_2.v = _mm_add_ps(_mm_mul_ps(eff_2.v, x_2.v), c_2.v);
909 for(j = 4 ; j < potential_chunk ; j++) {
910 c_1.v = _mm_setr_ps(data[0][j], data[1][j], data[2][j], data[3][j]);
911 c_2.v = _mm_setr_ps(data[4][j], data[5][j], data[6][j], data[7][j]);
912 eff_1.v = _mm_add_ps(_mm_mul_ps(eff_1.v, x_1.v), ee_1.v);
913 eff_2.v = _mm_add_ps(_mm_mul_ps(eff_2.v, x_2.v), ee_2.v);
914 ee_1.v = _mm_add_ps(_mm_mul_ps(ee_1.v, x_1.v), c_1.v);
915 ee_2.v = _mm_add_ps(_mm_mul_ps(ee_2.v, x_2.v), c_2.v);
916 }
917
918 /* store the result */
919 _mm_store_ps(&e[0], ee_1.v);
920 _mm_store_ps(&e[4], ee_2.v);
921 _mm_store_ps(&f[0], _mm_mul_ps(eff_1.v, _mm_div_ps(hi_1.v, r_1.v)));
922 _mm_store_ps(&f[4], _mm_mul_ps(eff_2.v, _mm_div_ps(hi_2.v, r_2.v)));
923
924 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE)
925 int j;
926 union {
927 vector float v;
928 vector unsigned int m;
929 float f[4];
930 unsigned int i[4];
931 } alpha0_1, alpha1_1, alpha2_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, ind_1,
932 alpha0_2, alpha1_2, alpha2_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2, ind_2;
933 float *data[8];
934
935 /* Get r . */
936 r_1.v = vec_sqrt(*((vector float *)&r2[0]));
937 r_2.v = vec_sqrt(*((vector float *)&r2[4]));
938
939 /* compute the index */
940 alpha0_1.v = vec_load4(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
941 alpha1_1.v = vec_load4(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
942 alpha2_1.v = vec_load4(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
943 alpha0_2.v = vec_load4(p[4]->alpha[0], p[5]->alpha[0], p[6]->alpha[0], p[7]->alpha[0]);
944 alpha1_2.v = vec_load4(p[4]->alpha[1], p[5]->alpha[1], p[6]->alpha[1], p[7]->alpha[1]);
945 alpha2_2.v = vec_load4(p[4]->alpha[2], p[5]->alpha[2], p[6]->alpha[2], p[7]->alpha[2]);
946 ind_1.m = vec_ctu(vec_madd(r_1.v, vec_madd(r_1.v, alpha2_1.v, alpha1_1.v), alpha0_1.v), 0);
947 ind_2.m = vec_ctu(vec_madd(r_2.v, vec_madd(r_2.v, alpha2_2.v, alpha1_2.v), alpha0_2.v), 0);
948
949 /* get the table offset */
950 data[0] = &(p[0]->c[ ind_1.i[0] * potential_chunk ]);
951 data[1] = &(p[1]->c[ ind_1.i[1] * potential_chunk ]);
952 data[2] = &(p[2]->c[ ind_1.i[2] * potential_chunk ]);
953 data[3] = &(p[3]->c[ ind_1.i[3] * potential_chunk ]);
954 data[4] = &(p[4]->c[ ind_2.i[0] * potential_chunk ]);
955 data[5] = &(p[5]->c[ ind_2.i[1] * potential_chunk ]);
956 data[6] = &(p[6]->c[ ind_2.i[2] * potential_chunk ]);
957 data[7] = &(p[7]->c[ ind_2.i[3] * potential_chunk ]);
958
959 /* adjust x to the interval */
960 mi_1.v = vec_load4(data[0][0], data[1][0], data[2][0], data[3][0]);
961 hi_1.v = vec_load4(data[0][1], data[1][1], data[2][1], data[3][1]);
962 mi_2.v = vec_load4(data[4][0], data[5][0], data[6][0], data[7][0]);
963 hi_2.v = vec_load4(data[4][1], data[5][1], data[6][1], data[7][1]);
964 x_1.v = vec_mul(vec_sub(r_1.v, mi_1.v), hi_1.v);
965 x_2.v = vec_mul(vec_sub(r_2.v, mi_2.v), hi_2.v);
966
967 /* compute the potential and its derivative */
968 eff_1.v = vec_load4(data[0][2], data[1][2], data[2][2], data[3][2]);
969 eff_2.v = vec_load4(data[4][2], data[5][2], data[6][2], data[7][2]);
970 c_1.v = vec_load4(data[0][3], data[1][3], data[2][3], data[3][3]);
971 c_2.v = vec_load4(data[4][3], data[5][3], data[6][3], data[7][3]);
972 ee_1.v = vec_madd(eff_1.v, x_1.v, c_1.v);
973 ee_2.v = vec_madd(eff_2.v, x_2.v, c_2.v);
974 for(j = 4 ; j < potential_chunk ; j++) {
975 c_1.v = vec_load4(data[0][j], data[1][j], data[2][j], data[3][j]);
976 c_2.v = vec_load4(data[4][j], data[5][j], data[6][j], data[7][j]);
977 eff_1.v = vec_madd(eff_1.v, x_1.v, ee_1.v);
978 eff_2.v = vec_madd(eff_2.v, x_2.v, ee_2.v);
979 ee_1.v = vec_madd(ee_1.v, x_1.v, c_1.v);
980 ee_2.v = vec_madd(ee_2.v, x_2.v, c_2.v);
981 }
982
983 /* store the result */
984 eff_1.v = vec_mul(eff_1.v, vec_div(hi_1.v, r_1.v));
985 eff_2.v = vec_mul(eff_2.v, vec_div(hi_2.v, r_2.v));
986 memcpy(&e[0], &ee_1, sizeof(vector float));
987 memcpy(&f[0], &eff_1, sizeof(vector float));
988 memcpy(&e[4], &ee_2, sizeof(vector float));
989 memcpy(&f[4], &eff_2, sizeof(vector float));
990
991 #else
992 int k;
993 FPTYPE ee, eff;
994 for(k = 0 ; k < 8 ; k++) {
995 potential_eval(p[k], r2[k], &ee, &eff);
996 e[k] = ee; f[k] = eff;
997 }
998 #endif
999
1000 }
1001
1002
1003 TF_ALWAYS_INLINE void potential_eval_vec_8single_r(struct Potential *p[8], float *r2, float *e, float *f) {
1004
1005 #if defined(__AVX__) && defined(FPTYPE_SINGLE)
1006 int j;
1007 union {
1008 __v8sf v;
1009 __m256i m;
1010 float f[8];
1011 int i[8];
1012 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r, ind;
1013 float *data[8];
1014
1015 /* Get r . */
1016 r.v = _mm256_load_ps(r2);
1017
1018 /* compute the index */
1019 alpha0.v = _mm256_setr_ps(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0], p[4]->alpha[0], p[5]->alpha[0], p[6]->alpha[0], p[7]->alpha[0]);
1020 alpha1.v = _mm256_setr_ps(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1], p[4]->alpha[1], p[5]->alpha[1], p[6]->alpha[1], p[7]->alpha[1]);
1021 alpha2.v = _mm256_setr_ps(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2], p[4]->alpha[2], p[5]->alpha[2], p[6]->alpha[2], p[7]->alpha[2]);
1022 ind.m = _mm256_cvttps_epi32(_mm256_max_ps(_mm256_setzero_ps(), _mm256_add_ps(alpha0.v, _mm256_mul_ps(r.v, _mm256_add_ps(alpha1.v, _mm256_mul_ps(r.v, alpha2.v))))));
1023
1024 /* get the table offset */
1025 data[0] = &(p[0]->c[ ind.i[0] * potential_chunk ]);
1026 data[1] = &(p[1]->c[ ind.i[1] * potential_chunk ]);
1027 data[2] = &(p[2]->c[ ind.i[2] * potential_chunk ]);
1028 data[3] = &(p[3]->c[ ind.i[3] * potential_chunk ]);
1029 data[4] = &(p[4]->c[ ind.i[4] * potential_chunk ]);
1030 data[5] = &(p[5]->c[ ind.i[5] * potential_chunk ]);
1031 data[6] = &(p[6]->c[ ind.i[6] * potential_chunk ]);
1032 data[7] = &(p[7]->c[ ind.i[7] * potential_chunk ]);
1033
1034 /* adjust x to the interval */
1035 mi.v = _mm256_setr_ps(data[0][0], data[1][0], data[2][0], data[3][0], data[4][0], data[5][0], data[6][0], data[7][0]);
1036 hi.v = _mm256_setr_ps(data[0][1], data[1][1], data[2][1], data[3][1], data[4][1], data[5][1], data[6][1], data[7][1]);
1037 x.v = _mm256_mul_ps(_mm256_sub_ps(r.v, mi.v), hi.v);
1038
1039 /* compute the potential and its derivative */
1040 eff.v = _mm256_setr_ps(data[0][2], data[1][2], data[2][2], data[3][2], data[4][2], data[5][2], data[6][2], data[7][2]);
1041 c.v = _mm256_setr_ps(data[0][3], data[1][3], data[2][3], data[3][3], data[4][3], data[5][3], data[6][3], data[7][3]);
1042 ee.v = _mm256_add_ps(_mm256_mul_ps(eff.v, x.v), c.v);
1043 for(j = 4 ; j < potential_chunk ; j++) {
1044 c.v = _mm256_setr_ps(data[0][j], data[1][j], data[2][j], data[3][j], data[4][j], data[5][j], data[6][j], data[7][j]);
1045 eff.v = _mm256_add_ps(_mm256_mul_ps(eff.v, x.v), ee.v);
1046 ee.v = _mm256_add_ps(_mm256_mul_ps(ee.v, x.v), c.v);
1047 }
1048
1049 /* store the result */
1050 _mm256_store_ps(e, ee.v);
1051 _mm256_store_ps(f, _mm256_mul_ps(eff.v, hi.v));
1052
1053 #elif defined(__SSE__) && defined(FPTYPE_SINGLE)
1054 int j;
1055 union {
1056 __v4sf v;
1057 __m128i m;
1058 float f[4];
1059 int i[4];
1060 } alpha0_1, alpha1_1, alpha2_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, ind_1,
1061 alpha0_2, alpha1_2, alpha2_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2, ind_2;
1062 float *data[8];
1063
1064 /* Get r . */
1065 r_1.v = _mm_load_ps(&r2[0]);
1066 r_2.v = _mm_load_ps(&r2[4]);
1067
1068 /* compute the index */
1069 alpha0_1.v = _mm_setr_ps(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
1070 alpha1_1.v = _mm_setr_ps(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
1071 alpha2_1.v = _mm_setr_ps(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
1072 alpha0_2.v = _mm_setr_ps(p[4]->alpha[0], p[5]->alpha[0], p[6]->alpha[0], p[7]->alpha[0]);
1073 alpha1_2.v = _mm_setr_ps(p[4]->alpha[1], p[5]->alpha[1], p[6]->alpha[1], p[7]->alpha[1]);
1074 alpha2_2.v = _mm_setr_ps(p[4]->alpha[2], p[5]->alpha[2], p[6]->alpha[2], p[7]->alpha[2]);
1075 ind_1.m = _mm_cvttps_epi32(_mm_max_ps(_mm_setzero_ps(), _mm_add_ps(alpha0_1.v, _mm_mul_ps(r_1.v, _mm_add_ps(alpha1_1.v, _mm_mul_ps(r_1.v, alpha2_1.v))))));
1076 ind_2.m = _mm_cvttps_epi32(_mm_max_ps(_mm_setzero_ps(), _mm_add_ps(alpha0_2.v, _mm_mul_ps(r_2.v, _mm_add_ps(alpha1_2.v, _mm_mul_ps(r_2.v, alpha2_2.v))))));
1077
1078 /* get the table offset */
1079 data[0] = &(p[0]->c[ ind_1.i[0] * potential_chunk ]);
1080 data[1] = &(p[1]->c[ ind_1.i[1] * potential_chunk ]);
1081 data[2] = &(p[2]->c[ ind_1.i[2] * potential_chunk ]);
1082 data[3] = &(p[3]->c[ ind_1.i[3] * potential_chunk ]);
1083 data[4] = &(p[4]->c[ ind_2.i[0] * potential_chunk ]);
1084 data[5] = &(p[5]->c[ ind_2.i[1] * potential_chunk ]);
1085 data[6] = &(p[6]->c[ ind_2.i[2] * potential_chunk ]);
1086 data[7] = &(p[7]->c[ ind_2.i[3] * potential_chunk ]);
1087
1088 /* adjust x to the interval */
1089 mi_1.v = _mm_setr_ps(data[0][0], data[1][0], data[2][0], data[3][0]);
1090 hi_1.v = _mm_setr_ps(data[0][1], data[1][1], data[2][1], data[3][1]);
1091 mi_2.v = _mm_setr_ps(data[4][0], data[5][0], data[6][0], data[7][0]);
1092 hi_2.v = _mm_setr_ps(data[4][1], data[5][1], data[6][1], data[7][1]);
1093 x_1.v = _mm_mul_ps(_mm_sub_ps(r_1.v, mi_1.v), hi_1.v);
1094 x_2.v = _mm_mul_ps(_mm_sub_ps(r_2.v, mi_2.v), hi_2.v);
1095
1096 /* compute the potential and its derivative */
1097 eff_1.v = _mm_setr_ps(data[0][2], data[1][2], data[2][2], data[3][2]);
1098 eff_2.v = _mm_setr_ps(data[4][2], data[5][2], data[6][2], data[7][2]);
1099 c_1.v = _mm_setr_ps(data[0][3], data[1][3], data[2][3], data[3][3]);
1100 c_2.v = _mm_setr_ps(data[4][3], data[5][3], data[6][3], data[7][3]);
1101 ee_1.v = _mm_add_ps(_mm_mul_ps(eff_1.v, x_1.v), c_1.v);
1102 ee_2.v = _mm_add_ps(_mm_mul_ps(eff_2.v, x_2.v), c_2.v);
1103 for(j = 4 ; j < potential_chunk ; j++) {
1104 c_1.v = _mm_setr_ps(data[0][j], data[1][j], data[2][j], data[3][j]);
1105 c_2.v = _mm_setr_ps(data[4][j], data[5][j], data[6][j], data[7][j]);
1106 eff_1.v = _mm_add_ps(_mm_mul_ps(eff_1.v, x_1.v), ee_1.v);
1107 eff_2.v = _mm_add_ps(_mm_mul_ps(eff_2.v, x_2.v), ee_2.v);
1108 ee_1.v = _mm_add_ps(_mm_mul_ps(ee_1.v, x_1.v), c_1.v);
1109 ee_2.v = _mm_add_ps(_mm_mul_ps(ee_2.v, x_2.v), c_2.v);
1110 }
1111
1112 /* store the result */
1113 _mm_store_ps(&e[0], ee_1.v);
1114 _mm_store_ps(&e[4], ee_2.v);
1115 _mm_store_ps(&f[0], _mm_mul_ps(eff_1.v, hi_1.v));
1116 _mm_store_ps(&f[4], _mm_mul_ps(eff_2.v, hi_2.v));
1117
1118 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE)
1119 int j;
1120 union {
1121 vector float v;
1122 vector unsigned int m;
1123 float f[4];
1124 unsigned int i[4];
1125 } alpha0_1, alpha1_1, alpha2_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, ind_1,
1126 alpha0_2, alpha1_2, alpha2_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2, ind_2;
1127 float *data[8];
1128
1129 /* Get r . */
1130 r_1.v = *((vector float *)&r2[0]);
1131 r_2.v = *((vector float *)&r2[4]);
1132
1133 /* compute the index */
1134 alpha0_1.v = vec_load4(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
1135 alpha1_1.v = vec_load4(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
1136 alpha2_1.v = vec_load4(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
1137 alpha0_2.v = vec_load4(p[4]->alpha[0], p[5]->alpha[0], p[6]->alpha[0], p[7]->alpha[0]);
1138 alpha1_2.v = vec_load4(p[4]->alpha[1], p[5]->alpha[1], p[6]->alpha[1], p[7]->alpha[1]);
1139 alpha2_2.v = vec_load4(p[4]->alpha[2], p[5]->alpha[2], p[6]->alpha[2], p[7]->alpha[2]);
1140 ind_1.m = vec_ctu(vec_madd(r_1.v, vec_madd(r_1.v, alpha2_1.v, alpha1_1.v), alpha0_1.v), 0);
1141 ind_2.m = vec_ctu(vec_madd(r_2.v, vec_madd(r_2.v, alpha2_2.v, alpha1_2.v), alpha0_2.v), 0);
1142
1143 /* get the table offset */
1144 data[0] = &(p[0]->c[ ind_1.i[0] * potential_chunk ]);
1145 data[1] = &(p[1]->c[ ind_1.i[1] * potential_chunk ]);
1146 data[2] = &(p[2]->c[ ind_1.i[2] * potential_chunk ]);
1147 data[3] = &(p[3]->c[ ind_1.i[3] * potential_chunk ]);
1148 data[4] = &(p[4]->c[ ind_2.i[0] * potential_chunk ]);
1149 data[5] = &(p[5]->c[ ind_2.i[1] * potential_chunk ]);
1150 data[6] = &(p[6]->c[ ind_2.i[2] * potential_chunk ]);
1151 data[7] = &(p[7]->c[ ind_2.i[3] * potential_chunk ]);
1152
1153 /* adjust x to the interval */
1154 mi_1.v = vec_load4(data[0][0], data[1][0], data[2][0], data[3][0]);
1155 hi_1.v = vec_load4(data[0][1], data[1][1], data[2][1], data[3][1]);
1156 mi_2.v = vec_load4(data[4][0], data[5][0], data[6][0], data[7][0]);
1157 hi_2.v = vec_load4(data[4][1], data[5][1], data[6][1], data[7][1]);
1158 x_1.v = vec_mul(vec_sub(r_1.v, mi_1.v), hi_1.v);
1159 x_2.v = vec_mul(vec_sub(r_2.v, mi_2.v), hi_2.v);
1160
1161 /* compute the potential and its derivative */
1162 eff_1.v = vec_load4(data[0][2], data[1][2], data[2][2], data[3][2]);
1163 eff_2.v = vec_load4(data[4][2], data[5][2], data[6][2], data[7][2]);
1164 c_1.v = vec_load4(data[0][3], data[1][3], data[2][3], data[3][3]);
1165 c_2.v = vec_load4(data[4][3], data[5][3], data[6][3], data[7][3]);
1166 ee_1.v = vec_madd(eff_1.v, x_1.v, c_1.v);
1167 ee_2.v = vec_madd(eff_2.v, x_2.v, c_2.v);
1168 for(j = 4 ; j < potential_chunk ; j++) {
1169 c_1.v = vec_load4(data[0][j], data[1][j], data[2][j], data[3][j]);
1170 c_2.v = vec_load4(data[4][j], data[5][j], data[6][j], data[7][j]);
1171 eff_1.v = vec_madd(eff_1.v, x_1.v, ee_1.v);
1172 eff_2.v = vec_madd(eff_2.v, x_2.v, ee_2.v);
1173 ee_1.v = vec_madd(ee_1.v, x_1.v, c_1.v);
1174 ee_2.v = vec_madd(ee_2.v, x_2.v, c_2.v);
1175 }
1176
1177 /* store the result */
1178 eff_1.v = vec_mul(eff_1.v, hi_1.v);
1179 eff_2.v = vec_mul(eff_2.v, hi_2.v);
1180 memcpy(&e[0], &ee_1, sizeof(vector float));
1181 memcpy(&f[0], &eff_1, sizeof(vector float));
1182 memcpy(&e[4], &ee_2, sizeof(vector float));
1183 memcpy(&f[4], &eff_2, sizeof(vector float));
1184
1185 #else
1186 int k;
1187 FPTYPE ee, eff;
1188 for(k = 0 ; k < 8 ; k++) {
1189 potential_eval(p[k], r2[k], &ee, &eff);
1190 e[k] = ee; f[k] = eff;
1191 }
1192 #endif
1193
1194 }
1195
1196
1220 TF_ALWAYS_INLINE void potential_eval_vec_2double(struct Potential *p[2], FPTYPE *r2, FPTYPE *e, FPTYPE *f) {
1221
1222 #if defined(__SSE2__) && defined(FPTYPE_DOUBLE)
1223 int ind[2], j;
1224 union {
1225 __v2df v;
1226 double f[2];
1227 } alpha0, alpha1, alpha2, rind, mi, hi, x, ee, eff, c, r;
1228 double *data[2];
1229
1230 /* Get r . */
1231 r.v = _mm_sqrt_pd(_mm_load_pd(r2));
1232
1233 /* compute the index */
1234 alpha0.v = _mm_setr_pd(p[0]->alpha[0], p[1]->alpha[0]);
1235 alpha1.v = _mm_setr_pd(p[0]->alpha[1], p[1]->alpha[1]);
1236 alpha2.v = _mm_setr_pd(p[0]->alpha[2], p[1]->alpha[2]);
1237 rind.v = _mm_max_pd(_mm_setzero_pd(), _mm_add_pd(alpha0.v, _mm_mul_pd(r.v, _mm_add_pd(alpha1.v, _mm_mul_pd(r.v, alpha2.v)))));
1238 ind[0] = rind.f[0];
1239 ind[1] = rind.f[1];
1240
1241 /* get the table offset */
1242 data[0] = &(p[0]->c[ ind[0] * potential_chunk ]);
1243 data[1] = &(p[1]->c[ ind[1] * potential_chunk ]);
1244
1245 /* adjust x to the interval */
1246 mi.v = _mm_setr_pd(data[0][0], data[1][0]);
1247 hi.v = _mm_setr_pd(data[0][1], data[1][1]);
1248 x.v = _mm_mul_pd(_mm_sub_pd(r.v, mi.v), hi.v);
1249
1250 /* compute the potential and its derivative */
1251 eff.v = _mm_setr_pd(data[0][2], data[1][2]);
1252 c.v = _mm_setr_pd(data[0][3], data[1][3]);
1253 ee.v = _mm_add_pd(_mm_mul_pd(eff.v, x.v), c.v);
1254 for(j = 4 ; j < potential_chunk ; j++) {
1255 c.v = _mm_setr_pd(data[0][j], data[1][j]);
1256 eff.v = _mm_add_pd(_mm_mul_pd(eff.v, x.v), ee.v);
1257 ee.v = _mm_add_pd(_mm_mul_pd(ee.v, x.v), c.v);
1258 }
1259
1260 /* store the result */
1261 _mm_store_pd(e, ee.v);
1262 _mm_store_pd(f, _mm_mul_pd(eff.v, _mm_div_pd(hi.v, r.v)));
1263
1264 #else
1265 int k;
1266 for(k = 0 ; k < 2 ; k++)
1267 potential_eval(p[k], r2[k], &e[k], &f[k]);
1268 #endif
1269
1270 }
1271
1272
1296 TF_ALWAYS_INLINE void potential_eval_vec_4double(struct Potential *p[4], FPTYPE *r2, FPTYPE *e, FPTYPE *f) {
1297
1298 #if defined(__AVX__) && defined(FPTYPE_DOUBLE)
1299 int ind[4], j;
1300 union {
1301 __v4df v;
1302 double f[4];
1303 } alpha0, alpha1, alpha2, rind, mi, hi, x, ee, eff, c, r;
1304 double *data[4];
1305
1306 /* Get r . */
1307 r.v = _mm256_sqrt_pd(_mm256_load_pd(r2));
1308
1309 /* compute the index */
1310 alpha0.v = _mm256_setr_pd(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
1311 alpha1.v = _mm256_setr_pd(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
1312 alpha2.v = _mm256_setr_pd(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
1313 rind.v = _mm256_max_pd(_mm256_setzero_pd(), _mm256_add_pd(alpha0.v, _mm256_mul_pd(r.v, _mm256_add_pd(alpha1.v, _mm256_mul_pd(r.v, alpha2.v)))));
1314 ind[0] = rind.f[0];
1315 ind[1] = rind.f[1];
1316 ind[2] = rind.f[2];
1317 ind[3] = rind.f[3];
1318
1319 /* get the table offset */
1320 data[0] = &(p[0]->c[ ind[0] * potential_chunk ]);
1321 data[1] = &(p[1]->c[ ind[1] * potential_chunk ]);
1322 data[2] = &(p[2]->c[ ind[2] * potential_chunk ]);
1323 data[3] = &(p[3]->c[ ind[3] * potential_chunk ]);
1324
1325 /* adjust x to the interval */
1326 mi.v = _mm256_setr_pd(data[0][0], data[1][0], data[2][0], data[3][0]);
1327 hi.v = _mm256_setr_pd(data[0][1], data[1][1], data[2][1], data[3][1]);
1328 x.v = _mm256_mul_pd(_mm256_sub_pd(r.v, mi.v), hi.v);
1329
1330 /* compute the potential and its derivative */
1331 eff.v = _mm256_setr_pd(data[0][2], data[1][2], data[2][2], data[3][2]);
1332 c.v = _mm256_setr_pd(data[0][3], data[1][3], data[2][3], data[3][3]);
1333 ee.v = _mm256_add_pd(_mm256_mul_pd(eff.v, x.v), c.v);
1334 for(j = 4 ; j < potential_chunk ; j++) {
1335 c.v = _mm256_setr_pd(data[0][j], data[1][j], data[2][j], data[3][j]);
1336 eff.v = _mm256_add_pd(_mm256_mul_pd(eff.v, x.v), ee.v);
1337 ee.v = _mm256_add_pd(_mm256_mul_pd(ee.v, x.v), c.v);
1338 }
1339
1340 /* store the result */
1341 _mm256_store_pd(e, ee.v);
1342 _mm256_store_pd(f, _mm256_mul_pd(eff.v, _mm256_div_pd(hi.v, r.v)));
1343
1344 #elif defined(__SSE2__) && defined(FPTYPE_DOUBLE)
1345 int ind[4], j;
1346 union {
1347 __v2df v;
1348 double f[2];
1349 } alpha0_1, alpha1_1, alpha2_1, rind_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1,
1350 alpha0_2, alpha1_2, alpha2_2, rind_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2;
1351 double *data[4];
1352
1353 /* Get r . */
1354 r_1.v = _mm_sqrt_pd(_mm_load_pd(&r2[0]));
1355 r_2.v = _mm_sqrt_pd(_mm_load_pd(&r2[2]));
1356
1357 /* compute the index */
1358 alpha0_1.v = _mm_setr_pd(p[0]->alpha[0], p[1]->alpha[0]);
1359 alpha1_1.v = _mm_setr_pd(p[0]->alpha[1], p[1]->alpha[1]);
1360 alpha2_1.v = _mm_setr_pd(p[0]->alpha[2], p[1]->alpha[2]);
1361 alpha0_2.v = _mm_setr_pd(p[2]->alpha[0], p[3]->alpha[0]);
1362 alpha1_2.v = _mm_setr_pd(p[2]->alpha[1], p[3]->alpha[1]);
1363 alpha2_2.v = _mm_setr_pd(p[2]->alpha[2], p[3]->alpha[2]);
1364 rind_1.v = _mm_max_pd(_mm_setzero_pd(), _mm_add_pd(alpha0_1.v, _mm_mul_pd(r_1.v, _mm_add_pd(alpha1_1.v, _mm_mul_pd(r_1.v, alpha2_1.v)))));
1365 rind_2.v = _mm_max_pd(_mm_setzero_pd(), _mm_add_pd(alpha0_2.v, _mm_mul_pd(r_2.v, _mm_add_pd(alpha1_2.v, _mm_mul_pd(r_2.v, alpha2_2.v)))));
1366 ind[0] = rind_1.f[0];
1367 ind[1] = rind_1.f[1];
1368 ind[2] = rind_2.f[0];
1369 ind[3] = rind_2.f[1];
1370
1371 /* get the table offset */
1372 data[0] = &(p[0]->c[ ind[0] * potential_chunk ]);
1373 data[1] = &(p[1]->c[ ind[1] * potential_chunk ]);
1374 data[2] = &(p[2]->c[ ind[2] * potential_chunk ]);
1375 data[3] = &(p[3]->c[ ind[3] * potential_chunk ]);
1376
1377 /* adjust x to the interval */
1378 mi_1.v = _mm_setr_pd(data[0][0], data[1][0]);
1379 hi_1.v = _mm_setr_pd(data[0][1], data[1][1]);
1380 mi_2.v = _mm_setr_pd(data[2][0], data[3][0]);
1381 hi_2.v = _mm_setr_pd(data[2][1], data[3][1]);
1382 x_1.v = _mm_mul_pd(_mm_sub_pd(r_1.v, mi_1.v), hi_1.v);
1383 x_2.v = _mm_mul_pd(_mm_sub_pd(r_2.v, mi_2.v), hi_2.v);
1384
1385 /* compute the potential and its derivative */
1386 eff_1.v = _mm_setr_pd(data[0][2], data[1][2]);
1387 eff_2.v = _mm_setr_pd(data[2][2], data[3][2]);
1388 c_1.v = _mm_setr_pd(data[0][3], data[1][3]);
1389 c_2.v = _mm_setr_pd(data[2][3], data[3][3]);
1390 ee_1.v = _mm_add_pd(_mm_mul_pd(eff_1.v, x_1.v), c_1.v);
1391 ee_2.v = _mm_add_pd(_mm_mul_pd(eff_2.v, x_2.v), c_2.v);
1392 for(j = 4 ; j < potential_chunk ; j++) {
1393 c_1.v = _mm_setr_pd(data[0][j], data[1][j]);
1394 c_2.v = _mm_setr_pd(data[2][j], data[3][j]);
1395 eff_1.v = _mm_add_pd(_mm_mul_pd(eff_1.v, x_1.v), ee_1.v);
1396 eff_2.v = _mm_add_pd(_mm_mul_pd(eff_2.v, x_2.v), ee_2.v);
1397 ee_1.v = _mm_add_pd(_mm_mul_pd(ee_1.v, x_1.v), c_1.v);
1398 ee_2.v = _mm_add_pd(_mm_mul_pd(ee_2.v, x_2.v), c_2.v);
1399 }
1400
1401 /* store the result */
1402 _mm_store_pd(&e[0], ee_1.v);
1403 _mm_store_pd(&f[0], _mm_mul_pd(eff_1.v, _mm_div_pd(hi_1.v, r_1.v)));
1404 _mm_store_pd(&e[2], ee_2.v);
1405 _mm_store_pd(&f[2], _mm_mul_pd(eff_2.v, _mm_div_pd(hi_2.v, r_2.v)));
1406
1407 #else
1408 int k;
1409 for(k = 0 ; k < 4 ; k++)
1410 potential_eval(p[k], r2[k], &e[k], &f[k]);
1411 #endif
1412
1413 }
1414
1415
1436 TF_ALWAYS_INLINE void potential_eval_vec_4double_r(struct Potential *p[4], FPTYPE *r_in, FPTYPE *e, FPTYPE *f) {
1437
1438 #if defined(__AVX__) && defined(FPTYPE_DOUBLE)
1439 int ind[4], j;
1440 union {
1441 __m256d v;
1442 double f[4];
1443 } alpha0, alpha1, alpha2, rind, mi, hi, x, ee, eff, c, r;
1444 double *data[4];
1445
1446 /* Get r . */
1447 r.v = _mm256_load_pd(r_in);
1448
1449 /* compute the index */
1450 alpha0.v = _mm256_setr_pd(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
1451 alpha1.v = _mm256_setr_pd(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
1452 alpha2.v = _mm256_setr_pd(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
1453 rind.v = _mm256_max_pd(_mm256_setzero_pd(), _mm256_add_pd(alpha0.v, _mm256_mul_pd(r.v, _mm256_add_pd(alpha1.v, _mm256_mul_pd(r.v, alpha2.v)))));
1454 ind[0] = rind.f[0];
1455 ind[1] = rind.f[1];
1456 ind[2] = rind.f[2];
1457 ind[3] = rind.f[3];
1458
1459 /* get the table offset */
1460 data[0] = &(p[0]->c[ ind[0] * potential_chunk ]);
1461 data[1] = &(p[1]->c[ ind[1] * potential_chunk ]);
1462 data[2] = &(p[2]->c[ ind[2] * potential_chunk ]);
1463 data[3] = &(p[3]->c[ ind[3] * potential_chunk ]);
1464
1465 /* adjust x to the interval */
1466 mi.v = _mm256_setr_pd(data[0][0], data[1][0], data[2][0], data[3][0]);
1467 hi.v = _mm256_setr_pd(data[0][1], data[1][1], data[2][1], data[3][1]);
1468 x.v = _mm256_mul_pd(_mm256_sub_pd(r.v, mi.v), hi.v);
1469
1470 /* compute the potential and its derivative */
1471 eff.v = _mm256_setr_pd(data[0][2], data[1][2], data[2][2], data[3][2]);
1472 c.v = _mm256_setr_pd(data[0][3], data[1][3], data[2][3], data[3][3]);
1473 ee.v = _mm256_add_pd(_mm256_mul_pd(eff.v, x.v), c.v);
1474 for(j = 4 ; j < potential_chunk ; j++) {
1475 c.v = _mm256_setr_pd(data[0][j], data[1][j], data[2][j], data[3][j]);
1476 eff.v = _mm256_add_pd(_mm256_mul_pd(eff.v, x.v), ee.v);
1477 ee.v = _mm256_add_pd(_mm256_mul_pd(ee.v, x.v), c.v);
1478 }
1479
1480 /* store the result */
1481 _mm256_store_pd(e, ee.v);
1482 _mm256_store_pd(f, _mm256_mul_pd(eff.v, hi.v));
1483
1484 #elif defined(__SSE2__) && defined(FPTYPE_DOUBLE)
1485 int ind[4], j;
1486 union {
1487 __v2df v;
1488 double f[2];
1489 } alpha0_1, alpha1_1, alpha2_1, rind_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1,
1490 alpha0_2, alpha1_2, alpha2_2, rind_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2;
1491 double *data[4];
1492
1493 /* Get r . */
1494 r_1.v = _mm_load_pd(&r_in[0]);
1495 r_2.v = _mm_load_pd(&r_in[2]);
1496
1497 /* compute the index */
1498 alpha0_1.v = _mm_setr_pd(p[0]->alpha[0], p[1]->alpha[0]);
1499 alpha1_1.v = _mm_setr_pd(p[0]->alpha[1], p[1]->alpha[1]);
1500 alpha2_1.v = _mm_setr_pd(p[0]->alpha[2], p[1]->alpha[2]);
1501 alpha0_2.v = _mm_setr_pd(p[2]->alpha[0], p[3]->alpha[0]);
1502 alpha1_2.v = _mm_setr_pd(p[2]->alpha[1], p[3]->alpha[1]);
1503 alpha2_2.v = _mm_setr_pd(p[2]->alpha[2], p[3]->alpha[2]);
1504 rind_1.v = _mm_max_pd(_mm_setzero_pd(), _mm_add_pd(alpha0_1.v, _mm_mul_pd(r_1.v, _mm_add_pd(alpha1_1.v, _mm_mul_pd(r_1.v, alpha2_1.v)))));
1505 rind_2.v = _mm_max_pd(_mm_setzero_pd(), _mm_add_pd(alpha0_2.v, _mm_mul_pd(r_2.v, _mm_add_pd(alpha1_2.v, _mm_mul_pd(r_2.v, alpha2_2.v)))));
1506 ind[0] = rind_1.f[0];
1507 ind[1] = rind_1.f[1];
1508 ind[2] = rind_2.f[0];
1509 ind[3] = rind_2.f[1];
1510
1511 /* get the table offset */
1512 data[0] = &(p[0]->c[ ind[0] * potential_chunk ]);
1513 data[1] = &(p[1]->c[ ind[1] * potential_chunk ]);
1514 data[2] = &(p[2]->c[ ind[2] * potential_chunk ]);
1515 data[3] = &(p[3]->c[ ind[3] * potential_chunk ]);
1516
1517 /* adjust x to the interval */
1518 mi_1.v = _mm_setr_pd(data[0][0], data[1][0]);
1519 hi_1.v = _mm_setr_pd(data[0][1], data[1][1]);
1520 mi_2.v = _mm_setr_pd(data[2][0], data[3][0]);
1521 hi_2.v = _mm_setr_pd(data[2][1], data[3][1]);
1522 x_1.v = _mm_mul_pd(_mm_sub_pd(r_1.v, mi_1.v), hi_1.v);
1523 x_2.v = _mm_mul_pd(_mm_sub_pd(r_2.v, mi_2.v), hi_2.v);
1524
1525 /* compute the potential and its derivative */
1526 eff_1.v = _mm_setr_pd(data[0][2], data[1][2]);
1527 eff_2.v = _mm_setr_pd(data[2][2], data[3][2]);
1528 c_1.v = _mm_setr_pd(data[0][3], data[1][3]);
1529 c_2.v = _mm_setr_pd(data[2][3], data[3][3]);
1530 ee_1.v = _mm_add_pd(_mm_mul_pd(eff_1.v, x_1.v), c_1.v);
1531 ee_2.v = _mm_add_pd(_mm_mul_pd(eff_2.v, x_2.v), c_2.v);
1532 for(j = 4 ; j < potential_chunk ; j++) {
1533 c_1.v = _mm_setr_pd(data[0][j], data[1][j]);
1534 c_2.v = _mm_setr_pd(data[2][j], data[3][j]);
1535 eff_1.v = _mm_add_pd(_mm_mul_pd(eff_1.v, x_1.v), ee_1.v);
1536 eff_2.v = _mm_add_pd(_mm_mul_pd(eff_2.v, x_2.v), ee_2.v);
1537 ee_1.v = _mm_add_pd(_mm_mul_pd(ee_1.v, x_1.v), c_1.v);
1538 ee_2.v = _mm_add_pd(_mm_mul_pd(ee_2.v, x_2.v), c_2.v);
1539 }
1540
1541 /* store the result */
1542 _mm_store_pd(&e[0], ee_1.v);
1543 _mm_store_pd(&f[0], _mm_mul_pd(eff_1.v, hi_1.v));
1544 _mm_store_pd(&e[2], ee_2.v);
1545 _mm_store_pd(&f[2], _mm_mul_pd(eff_2.v, hi_2.v));
1546
1547 #else
1548 int k;
1549 for(k = 0 ; k < 4 ; k++)
1550 potential_eval_r(p[k], r_in[k], &e[k], &f[k]);
1551 #endif
1552
1553 }
1554
1555 TF_ALWAYS_INLINE Potential *get_potential(const Particle *a, const Particle *b) {
1556 int index = _Engine.max_type * a->typeId + b->typeId;
1557 if ((a->flags & b->flags & PARTICLE_BOUND) && (a->clusterId == b->clusterId)) {
1558 return _Engine.p_cluster[index];
1559 }
1560 else {
1561 return _Engine.p[index];
1562 }
1563 }
1564
1565};
1566
1567#endif // _MDCORE_SOURCE_TF_POTENTIAL_EVAL_H_
Include Python header, disable linking to pythonX_d.lib on Windows in debug mode.
Definition tfAngleConfig.h:26
TF_ALWAYS_INLINE void potential_eval_vec_4single_r(struct Potential *p[4], float *r_in, float *e, float *f)
Evaluates the given potential at a set of points (interpolated).
Definition tf_potential_eval.h:686
TF_ALWAYS_INLINE FPTYPE fptype_r2(FPTYPE *x1, FPTYPE *x2, FPTYPE *dx)
Inlined function to compute the distance^2 between two vectors.
Definition tf_fptype.h:87
TF_ALWAYS_INLINE void potential_eval_expl(struct Potential *p, FPTYPE r2, FPTYPE *e, FPTYPE *f)
Evaluates the given potential at the given radius explicitly.
Definition tf_potential_eval.h:296
FPTYPE space_cell_gaussian(int cell_id)
struct TissueForge::space_cell space_cell
the space_cell structure
TF_ALWAYS_INLINE void potential_eval_vec_2double(struct Potential *p[2], FPTYPE *r2, FPTYPE *e, FPTYPE *f)
Evaluates the given potential at a set of points (interpolated).
Definition tf_potential_eval.h:1220
TF_ALWAYS_INLINE void potential_eval(struct Potential *p, FPTYPE r2, FPTYPE *e, FPTYPE *f)
Evaluates the given potential at the given point (interpolated).
Definition tf_potential_eval.h:74
TF_ALWAYS_INLINE void potential_eval_r(struct Potential *p, FPTYPE r, FPTYPE *e, FPTYPE *f)
Evaluates the given potential at the given point (interpolated).
Definition tf_potential_eval.h:165
TF_ALWAYS_INLINE void potential_eval_vec_4double(struct Potential *p[4], FPTYPE *r2, FPTYPE *e, FPTYPE *f)
Evaluates the given potential at a set of points (interpolated).
Definition tf_potential_eval.h:1296
TF_ALWAYS_INLINE void potential_eval_vec_4double_r(struct Potential *p[4], FPTYPE *r_in, FPTYPE *e, FPTYPE *f)
Evaluates the given potential at a set of points (interpolated).
Definition tf_potential_eval.h:1436
engine _Engine
@ POTENTIAL_SHIFTED
Definition tfPotential.h:92
@ POTENTIAL_SCALED
Definition tfPotential.h:87
@ POTENTIAL_PERIODIC
Definition tfPotential.h:104
TF_ALWAYS_INLINE void potential_eval_vec_4single(struct Potential *p[4], float *r2, float *e, float *f)
Evaluates the given potential at a set of points (interpolated).
Definition tf_potential_eval.h:367
TF_ALWAYS_INLINE void potential_eval_vec_4single_old(struct Potential *p[4], float *r2, float *e, float *f)
Evaluates the given potential at a set of points (interpolated).
Definition tf_potential_eval.h:506
TF_ALWAYS_INLINE void potential_eval_vec_8single(struct Potential *p[8], float *r2, float *e, float *f)
Evaluates the given potential at a set of points (interpolated).
Definition tf_potential_eval.h:809
A Potential object is a compiled interpolation of a given function. The Universe applies potentials t...
Definition tfPotential.h:213
uint32_t flags
Definition tfPotential.h:217
FPTYPE * c
Definition tfPotential.h:224
FPTYPE alpha[4]
Definition tfPotential.h:221
int n
Definition tfPotential.h:238
static const int max_type
Definition tfEngine.h:193
struct Potential ** p
Definition tfEngine.h:200