22#ifndef _MDCORE_SOURCE_TF_POTENTIAL_EVAL_H_
23#define _MDCORE_SOURCE_TF_POTENTIAL_EVAL_H_
27#include "tf_dpd_eval.h"
43 TF_ALWAYS_INLINE FPTYPE potential_eval_adjust_distance(
struct Potential *p, FPTYPE ri, FPTYPE rj, FPTYPE r) {
48 r = r - (ri + rj) + p->r0_plusone;
53 TF_ALWAYS_INLINE FPTYPE potential_eval_adjust_distance2(
struct Potential *p, FPTYPE ri, FPTYPE rj, FPTYPE r2) {
54 FPTYPE r = potential_eval_adjust_distance(p, ri, rj, FPTYPE_SQRT(r2));
77 FPTYPE x, ee, eff, *c, r;
83 ind = FPTYPE_FMAX(FPTYPE_ZERO, p->
alpha[0] + r * (p->
alpha[1] + r * p->
alpha[2]));
86 c = &(p->
c[ind * potential_chunk]);
89 x = (r - c[0]) * c[1];
94 for(k = 4 ; k < potential_chunk ; k++) {
100 *e = ee; *f = eff * c[1] / r;
104 TF_ALWAYS_INLINE
bool potential_eval_ex(
105 struct Potential *p, FPTYPE ri, FPTYPE rj, FPTYPE r2, FPTYPE *e, FPTYPE *f) {
108 FPTYPE x, ee, eff, *c, r, ro;
110 static const FPTYPE epsilon = std::numeric_limits<FPTYPE>::epsilon();
114 ro = r < epsilon ? epsilon : r;
116 r = potential_eval_adjust_distance(p, ri, rj, r);
119 r = r < p->a ? p->a : r;
122 ind = std::max(FPTYPE_ZERO, p->alpha[0] + r * (p->alpha[1] + r * p->alpha[2]));
124 if(r > p->b || ind > p->n) {
129 c = &(p->c[ind * potential_chunk]);
132 x = (r - c[0]) * c[1];
135 ee = c[2] * x + c[3];
137 for(k = 4 ; k < potential_chunk ; k++) {
143 *e = ee; *f = eff * c[1] / ro;
168 FPTYPE x, ee, eff, *c;
171 ind = FPTYPE_FMAX(FPTYPE_ZERO, p->
alpha[0] + r * (p->
alpha[1] + r * p->
alpha[2]));
178 c = &(p->
c[ind * potential_chunk]);
181 x = (r - c[0]) * c[1];
184 ee = c[2] * x + c[3];
186 for(k = 4 ; k < potential_chunk ; k++) {
192 *e = ee; *f = eff * c[1];
196 TF_ALWAYS_INLINE
bool potential_eval_super_ex(
const space_cell *cell,
197 Potential *pot, Particle *part_i, Particle *part_j,
198 FPTYPE *dx, FPTYPE r2, FPTYPE *epot);
200 static bool _potential_eval_super_ex(
const space_cell *cell,
201 Potential *pot, Particle *part_i, Particle *part_j,
202 FPTYPE *dx, FPTYPE r2, FPTYPE *epot)
204 return potential_eval_super_ex(cell, pot, part_i, part_j, dx, r2, epot);
207 bool potential_eval_super_ex(
const space_cell *cell,
208 Potential *pot, Particle *part_i, Particle *part_j,
209 FPTYPE *dx, FPTYPE r2, FPTYPE *epot) {
211 if(pot->kind == POTENTIAL_KIND_COMBINATION) {
212 if(pot->flags & POTENTIAL_SUM) {
213 return _potential_eval_super_ex(cell, pot->pca, part_i, part_j, dx, r2, epot) || _potential_eval_super_ex(cell, pot->pcb, part_i, part_j, dx, r2, epot);
227 for (
int k = 0; k < 3; k++) _dx[k] = dx[k];
230 if(pot->kind == POTENTIAL_KIND_DPD) {
232 if (dpd_eval((DPDPotential*)pot,
space_cell_gaussian(cell->id), part_i, part_j, _dx, _r2, &e)) {
239 else if(pot->kind == POTENTIAL_KIND_BYPARTICLES) {
240 FPTYPE fv[3] = {0., 0., 0.};
242 pot->eval_byparts(pot, part_i, part_j, _dx, _r2, &e, fv);
244 for (
int k = 0 ; k < 3 ; k++) {
245 part_i->f[k] += fv[k];
246 part_j->f[k] -= fv[k];
253 else if(pot->kind != POTENTIAL_KIND_COMBINATION) {
257 if (potential_eval_ex(pot, part_i->radius, part_j->radius, _r2, &e, &f)) {
259 for (
int k = 0 ; k < 3 ; k++) {
260 FPTYPE w = f * _dx[k];
298 const FPTYPE isqrtpi = 0.56418958354775628695;
299 const FPTYPE kappa = 3.0;
300 FPTYPE r = sqrt(r2), ir = 1.0 / r, ir2 = ir * ir, ir4, ir6, ir12, t1, t2;
301 FPTYPE ee = 0.0, eff = 0.0;
304 if(p->
flags & POTENTIAL_LJ126) {
307 ir4 = ir2 * ir2; ir6 = ir4 * ir2; ir12 = ir6 * ir6;
310 ee = (p->
alpha[0] * ir12 - p->
alpha[1] * ir6);
311 eff = 6.0 * ir * (-2.0 * p->
alpha[0] * ir12 + p->
alpha[1] * ir6);
316 if(p->
flags & POTENTIAL_EWALD) {
323 ee += p->
alpha[2] * t1 * ir;
324 eff += p->
alpha[2] * (-2.0 * isqrtpi * exp(-t2 * t2) * kappa * ir - t1 * ir2);
329 if(p->
flags & POTENTIAL_COULOMB) {
332 ee += potential_escale * p->
alpha[2] * ir;
333 eff += -potential_escale * p->
alpha[2] * ir2;
369 #if defined(__SSE__) && defined(FPTYPE_SINGLE)
376 } alpha[4], mi, hi, x, ee, eff, c[6], r, ind, t[8];
380 r.v = _mm_sqrt_ps(_mm_load_ps(r2));
383 alpha[0].v = _mm_load_ps(p[0]->alpha);
384 alpha[1].v = _mm_load_ps(p[1]->alpha);
385 alpha[2].v = _mm_load_ps(p[2]->alpha);
386 alpha[3].v = _mm_load_ps(p[3]->alpha);
387 t[0].m = _mm_unpacklo_epi32(alpha[0].m, alpha[1].m);
388 t[1].m = _mm_unpacklo_epi32(alpha[2].m, alpha[3].m);
389 t[2].m = _mm_unpackhi_epi32(alpha[0].m, alpha[1].m);
390 t[3].m = _mm_unpackhi_epi32(alpha[2].m, alpha[3].m);
391 alpha[0].m = _mm_unpacklo_epi64(t[0].m, t[1].m);
392 alpha[1].m = _mm_unpackhi_epi64(t[0].m, t[1].m);
393 alpha[2].m = _mm_unpacklo_epi64(t[2].m, t[3].m);
394 ind.m = _mm_cvttps_epi32(_mm_max_ps(_mm_setzero_ps(), _mm_add_ps(alpha[0].v, _mm_mul_ps(r.v, _mm_add_ps(alpha[1].v, _mm_mul_ps(r.v, alpha[2].v))))));
397 mi.v = _mm_load_ps(&p[0]->c[ ind.i[0] * potential_chunk ]);
398 hi.v = _mm_load_ps(&p[1]->c[ ind.i[1] * potential_chunk ]);
399 c[0].v = _mm_load_ps(&p[2]->c[ ind.i[2] * potential_chunk ]);
400 c[1].v = _mm_load_ps(&p[3]->c[ ind.i[3] * potential_chunk ]);
401 _MM_TRANSPOSE4_PS(mi.v, hi.v, c[0].v, c[1].v);
402 c[2].v = _mm_load_ps(&p[0]->c[ ind.i[0] * potential_chunk + 4 ]);
403 c[3].v = _mm_load_ps(&p[1]->c[ ind.i[1] * potential_chunk + 4 ]);
404 c[4].v = _mm_load_ps(&p[2]->c[ ind.i[2] * potential_chunk + 4 ]);
405 c[5].v = _mm_load_ps(&p[3]->c[ ind.i[3] * potential_chunk + 4 ]);
406 _MM_TRANSPOSE4_PS(c[2].v, c[3].v, c[4].v, c[5].v);
409 x.v = _mm_mul_ps(_mm_sub_ps(r.v, mi.v), hi.v);
413 ee.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), c[1].v);
414 eff.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), ee.v);
415 ee.v = _mm_add_ps(_mm_mul_ps(ee.v, x.v), c[2].v);
416 eff.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), ee.v);
417 ee.v = _mm_add_ps(_mm_mul_ps(ee.v, x.v), c[3].v);
418 eff.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), ee.v);
419 ee.v = _mm_add_ps(_mm_mul_ps(ee.v, x.v), c[4].v);
420 eff.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), ee.v);
421 ee.v = _mm_add_ps(_mm_mul_ps(ee.v, x.v), c[5].v);
424 _mm_store_ps(e, ee.v);
425 _mm_store_ps(f, _mm_mul_ps(eff.v, _mm_div_ps(hi.v, r.v)));
427 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE)
432 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r;
434 vector
unsigned int v;
440 r.v = vec_sqrt(*((vector
float *)r2));
443 alpha0.v = vec_load4(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
444 alpha1.v = vec_load4(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
445 alpha2.v = vec_load4(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
446 ind.v = vec_ctu(vec_madd(r.v, vec_madd(r.v, alpha2.v, alpha1.v), alpha0.v), 0);
449 for(k = 0 ; k < 4 ; k++)
450 data[k] = &(p[k]->c[ ind.i[k] * potential_chunk ]);
453 mi.v = vec_load4(data[0][0], data[1][0], data[2][0], data[3][0]);
454 hi.v = vec_load4(data[0][1], data[1][1], data[2][1], data[3][1]);
455 x.v = vec_mul(vec_sub(r.v, mi.v), hi.v);
458 eff.v = vec_load4(data[0][2], data[1][2], data[2][2], data[3][2]);
459 c.v = vec_load4(data[0][3], data[1][3], data[2][3], data[3][3]);
460 ee.v = vec_madd(eff.v, x.v, c.v);
461 for(j = 4 ; j < potential_chunk ; j++) {
462 c.v = vec_load4(data[0][j], data[1][j], data[2][j], data[3][j]);
463 eff.v = vec_madd(eff.v, x.v, ee.v);
464 ee.v = vec_madd(ee.v, x.v, c.v);
468 *((vector
float *)e) = ee.v;
469 *((vector
float *)f) = vec_mul(eff.v, vec_div(hi.v, r.v));
474 for(k = 0 ; k < 4 ; k++) {
476 e[k] = ee; f[k] = eff;
508 #if defined(__SSE__) && defined(FPTYPE_SINGLE)
515 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r, ind;
519 r.v = _mm_sqrt_ps(_mm_load_ps(r2));
522 alpha0.v = _mm_setr_ps(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
523 alpha1.v = _mm_setr_ps(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
524 alpha2.v = _mm_setr_ps(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
525 ind.m = _mm_cvttps_epi32(_mm_max_ps(_mm_setzero_ps(), _mm_add_ps(alpha0.v, _mm_mul_ps(r.v, _mm_add_ps(alpha1.v, _mm_mul_ps(r.v, alpha2.v))))));
528 for(k = 0 ; k < 4 ; k++)
529 data[k] = &(p[k]->c[ ind.i[k] * potential_chunk ]);
532 mi.v = _mm_setr_ps(data[0][0], data[1][0], data[2][0], data[3][0]);
533 hi.v = _mm_setr_ps(data[0][1], data[1][1], data[2][1], data[3][1]);
534 x.v = _mm_mul_ps(_mm_sub_ps(r.v, mi.v), hi.v);
537 eff.v = _mm_setr_ps(data[0][2], data[1][2], data[2][2], data[3][2]);
538 c.v = _mm_setr_ps(data[0][3], data[1][3], data[2][3], data[3][3]);
539 ee.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), c.v);
540 for(j = 4 ; j < potential_chunk ; j++) {
541 c.v = _mm_setr_ps(data[0][j], data[1][j], data[2][j], data[3][j]);
542 eff.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), ee.v);
543 ee.v = _mm_add_ps(_mm_mul_ps(ee.v, x.v), c.v);
547 _mm_store_ps(e, ee.v);
548 _mm_store_ps(f, _mm_mul_ps(eff.v, _mm_div_ps(hi.v, r.v)));
550 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE)
555 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r;
557 vector
unsigned int v;
563 r.v = vec_sqrt(*((vector
float *)r2));
566 alpha0.v = vec_load4(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
567 alpha1.v = vec_load4(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
568 alpha2.v = vec_load4(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
569 ind.v = vec_ctu(vec_madd(r.v, vec_madd(r.v, alpha2.v, alpha1.v), alpha0.v), 0);
572 for(k = 0 ; k < 4 ; k++)
573 data[k] = &(p[k]->c[ ind.i[k] * potential_chunk ]);
576 mi.v = vec_load4(data[0][0], data[1][0], data[2][0], data[3][0]);
577 hi.v = vec_load4(data[0][1], data[1][1], data[2][1], data[3][1]);
578 x.v = vec_mul(vec_sub(r.v, mi.v), hi.v);
581 eff.v = vec_load4(data[0][2], data[1][2], data[2][2], data[3][2]);
582 c.v = vec_load4(data[0][3], data[1][3], data[2][3], data[3][3]);
583 ee.v = vec_madd(eff.v, x.v, c.v);
584 for(j = 4 ; j < potential_chunk ; j++) {
585 c.v = vec_load4(data[0][j], data[1][j], data[2][j], data[3][j]);
586 eff.v = vec_madd(eff.v, x.v, ee.v);
587 ee.v = vec_madd(ee.v, x.v, c.v);
591 *((vector
float *)e) = ee.v;
592 *((vector
float *)f) = vec_mul(eff.v, vec_div(hi.v, r.v));
597 for(k = 0 ; k < 4 ; k++) {
599 e[k] = ee; f[k] = eff;
606 #ifdef STILL_NOT_READY_FOR_PRIME_TIME
607 TF_ALWAYS_INLINE
void potential_eval_vec_4single_gccvec(
struct Potential *p[4],
float *r2,
float *e,
float *f) {
613 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r;
621 r.v = sqrtf(*((vector(4,
float) *)r2));
624 for(k = 0 ; k < 4 ; k++) {
625 alpha0.f[k] = p[k]->alpha[0];
626 alpha1.f[k] = p[k]->alpha[1];
627 alpha2.f[k] = p[k]->alpha[2];
629 ind.v = max((vector(4,
int)){0,0,0,0}, (vector(4,
int))(alpha0.v + r.v*(alpha1.v + r.v*alpha2.v)));
632 for(k = 0 ; k < 4 ; k++)
633 data[k] = &(p[k]->c[ ind.i[k] * potential_chunk ]);
636 for(k = 0 ; k < 4 ; k++) {
637 mi.f[k] = data[k][0];
638 hi.f[k] = data[k][1];
640 x.v = (r.v - mi.v) * hi.v;
643 for(k = 0 ; k < 4 ; k++) {
644 eff.f[k] = data[k][2];
647 ee.v = eff.v*x.v + c.v;
648 for(j = 4 ; j < potential_chunk ; j++) {
649 for(k = 0 ; k < 4 ; k++)
651 eff.v = eff.v*x.v + ee.v;
652 ee.v = ee.v*x.v + c.v;
656 *((vector(4,
float) *)e) = ee.v;
657 *((vector(4,
float) *)f) = eff.v*(hi.v / r.v);
688 #if defined(__SSE__) && defined(FPTYPE_SINGLE)
695 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r, ind;
699 r.v = _mm_load_ps(r_in);
702 alpha0.v = _mm_setr_ps(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
703 alpha1.v = _mm_setr_ps(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
704 alpha2.v = _mm_setr_ps(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
705 ind.m = _mm_cvttps_epi32(_mm_max_ps(_mm_setzero_ps(), _mm_add_ps(alpha0.v, _mm_mul_ps(r.v, _mm_add_ps(alpha1.v, _mm_mul_ps(r.v, alpha2.v))))));
708 for(k = 0 ; k < 4 ; k++)
709 data[k] = &(p[k]->c[ ind.i[k] * potential_chunk ]);
712 mi.v = _mm_setr_ps(data[0][0], data[1][0], data[2][0], data[3][0]);
713 hi.v = _mm_setr_ps(data[0][1], data[1][1], data[2][1], data[3][1]);
714 x.v = _mm_mul_ps(_mm_sub_ps(r.v, mi.v), hi.v);
717 eff.v = _mm_setr_ps(data[0][2], data[1][2], data[2][2], data[3][2]);
718 c.v = _mm_setr_ps(data[0][3], data[1][3], data[2][3], data[3][3]);
719 ee.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), c.v);
720 for(j = 4 ; j < potential_chunk ; j++) {
721 c.v = _mm_setr_ps(data[0][j], data[1][j], data[2][j], data[3][j]);
722 eff.v = _mm_add_ps(_mm_mul_ps(eff.v, x.v), ee.v);
723 ee.v = _mm_add_ps(_mm_mul_ps(ee.v, x.v), c.v);
727 _mm_store_ps(e, ee.v);
728 _mm_store_ps(f, _mm_mul_ps(eff.v, hi.v));
730 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE)
735 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r;
737 vector
unsigned int v;
743 r.v = *((vector
float *)r_in);
746 alpha0.v = vec_load4(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
747 alpha1.v = vec_load4(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
748 alpha2.v = vec_load4(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
749 ind.v = vec_ctu(vec_madd(r.v, vec_madd(r.v, alpha2.v, alpha1.v), alpha0.v), 0);
752 for(k = 0 ; k < 4 ; k++)
753 data[k] = &(p[k]->c[ ind.i[k] * potential_chunk ]);
756 mi.v = vec_load4(data[0][0], data[1][0], data[2][0], data[3][0]);
757 hi.v = vec_load4(data[0][1], data[1][1], data[2][1], data[3][1]);
758 x.v = vec_mul(vec_sub(r.v, mi.v), hi.v);
761 eff.v = vec_load4(data[0][2], data[1][2], data[2][2], data[3][2]);
762 c.v = vec_load4(data[0][3], data[1][3], data[2][3], data[3][3]);
763 ee.v = vec_madd(eff.v, x.v, c.v);
764 for(j = 4 ; j < potential_chunk ; j++) {
765 c.v = vec_load4(data[0][j], data[1][j], data[2][j], data[3][j]);
766 eff.v = vec_madd(eff.v, x.v, ee.v);
767 ee.v = vec_madd(ee.v, x.v, c.v);
771 *((vector
float *)e) = ee.v;
772 *((vector
float *)f) = vec_mul(eff.v, hi.v);
777 for(k = 0 ; k < 4 ; k++) {
779 e[k] = ee; f[k] = eff;
811 #if defined(__AVX__) && defined(FPTYPE_SINGLE)
818 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r, ind;
822 r.v = _mm256_sqrt_ps(_mm256_load_ps(r2));
825 alpha0.v = _mm256_setr_ps(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0], p[4]->alpha[0], p[5]->alpha[0], p[6]->alpha[0], p[7]->alpha[0]);
826 alpha1.v = _mm256_setr_ps(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1], p[4]->alpha[1], p[5]->alpha[1], p[6]->alpha[1], p[7]->alpha[1]);
827 alpha2.v = _mm256_setr_ps(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2], p[4]->alpha[2], p[5]->alpha[2], p[6]->alpha[2], p[7]->alpha[2]);
828 ind.m = _mm256_cvttps_epi32(_mm256_max_ps(_mm256_setzero_ps(), _mm256_add_ps(alpha0.v, _mm256_mul_ps(r.v, _mm256_add_ps(alpha1.v, _mm256_mul_ps(r.v, alpha2.v))))));
831 data[0] = &(p[0]->
c[ ind.i[0] * potential_chunk ]);
832 data[1] = &(p[1]->
c[ ind.i[1] * potential_chunk ]);
833 data[2] = &(p[2]->
c[ ind.i[2] * potential_chunk ]);
834 data[3] = &(p[3]->
c[ ind.i[3] * potential_chunk ]);
835 data[4] = &(p[4]->
c[ ind.i[4] * potential_chunk ]);
836 data[5] = &(p[5]->
c[ ind.i[5] * potential_chunk ]);
837 data[6] = &(p[6]->
c[ ind.i[6] * potential_chunk ]);
838 data[7] = &(p[7]->
c[ ind.i[7] * potential_chunk ]);
841 mi.v = _mm256_setr_ps(data[0][0], data[1][0], data[2][0], data[3][0], data[4][0], data[5][0], data[6][0], data[7][0]);
842 hi.v = _mm256_setr_ps(data[0][1], data[1][1], data[2][1], data[3][1], data[4][1], data[5][1], data[6][1], data[7][1]);
843 x.v = _mm256_mul_ps(_mm256_sub_ps(r.v, mi.v), hi.v);
846 eff.v = _mm256_setr_ps(data[0][2], data[1][2], data[2][2], data[3][2], data[4][2], data[5][2], data[6][2], data[7][2]);
847 c.v = _mm256_setr_ps(data[0][3], data[1][3], data[2][3], data[3][3], data[4][3], data[5][3], data[6][3], data[7][3]);
848 ee.v = _mm256_add_ps(_mm256_mul_ps(eff.v, x.v), c.v);
849 for(j = 4 ; j < potential_chunk ; j++) {
850 c.v = _mm256_setr_ps(data[0][j], data[1][j], data[2][j], data[3][j], data[4][j], data[5][j], data[6][j], data[7][j]);
851 eff.v = _mm256_add_ps(_mm256_mul_ps(eff.v, x.v), ee.v);
852 ee.v = _mm256_add_ps(_mm256_mul_ps(ee.v, x.v), c.v);
856 _mm256_store_ps(e, ee.v);
857 _mm256_store_ps(f, _mm256_mul_ps(eff.v, _mm256_div_ps(hi.v, r.v)));
859 #elif defined(__SSE__) && defined(FPTYPE_SINGLE)
866 } alpha0_1, alpha1_1, alpha2_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, ind_1,
867 alpha0_2, alpha1_2, alpha2_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2, ind_2;
871 r_1.v = _mm_sqrt_ps(_mm_load_ps(&r2[0]));
872 r_2.v = _mm_sqrt_ps(_mm_load_ps(&r2[4]));
875 alpha0_1.v = _mm_setr_ps(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
876 alpha1_1.v = _mm_setr_ps(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
877 alpha2_1.v = _mm_setr_ps(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
878 alpha0_2.v = _mm_setr_ps(p[4]->alpha[0], p[5]->alpha[0], p[6]->alpha[0], p[7]->alpha[0]);
879 alpha1_2.v = _mm_setr_ps(p[4]->alpha[1], p[5]->alpha[1], p[6]->alpha[1], p[7]->alpha[1]);
880 alpha2_2.v = _mm_setr_ps(p[4]->alpha[2], p[5]->alpha[2], p[6]->alpha[2], p[7]->alpha[2]);
881 ind_1.m = _mm_cvttps_epi32(_mm_max_ps(_mm_setzero_ps(), _mm_add_ps(alpha0_1.v, _mm_mul_ps(r_1.v, _mm_add_ps(alpha1_1.v, _mm_mul_ps(r_1.v, alpha2_1.v))))));
882 ind_2.m = _mm_cvttps_epi32(_mm_max_ps(_mm_setzero_ps(), _mm_add_ps(alpha0_2.v, _mm_mul_ps(r_2.v, _mm_add_ps(alpha1_2.v, _mm_mul_ps(r_2.v, alpha2_2.v))))));
885 data[0] = &(p[0]->
c[ ind_1.i[0] * potential_chunk ]);
886 data[1] = &(p[1]->
c[ ind_1.i[1] * potential_chunk ]);
887 data[2] = &(p[2]->
c[ ind_1.i[2] * potential_chunk ]);
888 data[3] = &(p[3]->
c[ ind_1.i[3] * potential_chunk ]);
889 data[4] = &(p[4]->
c[ ind_2.i[0] * potential_chunk ]);
890 data[5] = &(p[5]->
c[ ind_2.i[1] * potential_chunk ]);
891 data[6] = &(p[6]->
c[ ind_2.i[2] * potential_chunk ]);
892 data[7] = &(p[7]->
c[ ind_2.i[3] * potential_chunk ]);
895 mi_1.v = _mm_setr_ps(data[0][0], data[1][0], data[2][0], data[3][0]);
896 hi_1.v = _mm_setr_ps(data[0][1], data[1][1], data[2][1], data[3][1]);
897 mi_2.v = _mm_setr_ps(data[4][0], data[5][0], data[6][0], data[7][0]);
898 hi_2.v = _mm_setr_ps(data[4][1], data[5][1], data[6][1], data[7][1]);
899 x_1.v = _mm_mul_ps(_mm_sub_ps(r_1.v, mi_1.v), hi_1.v);
900 x_2.v = _mm_mul_ps(_mm_sub_ps(r_2.v, mi_2.v), hi_2.v);
903 eff_1.v = _mm_setr_ps(data[0][2], data[1][2], data[2][2], data[3][2]);
904 eff_2.v = _mm_setr_ps(data[4][2], data[5][2], data[6][2], data[7][2]);
905 c_1.v = _mm_setr_ps(data[0][3], data[1][3], data[2][3], data[3][3]);
906 c_2.v = _mm_setr_ps(data[4][3], data[5][3], data[6][3], data[7][3]);
907 ee_1.v = _mm_add_ps(_mm_mul_ps(eff_1.v, x_1.v), c_1.v);
908 ee_2.v = _mm_add_ps(_mm_mul_ps(eff_2.v, x_2.v), c_2.v);
909 for(j = 4 ; j < potential_chunk ; j++) {
910 c_1.v = _mm_setr_ps(data[0][j], data[1][j], data[2][j], data[3][j]);
911 c_2.v = _mm_setr_ps(data[4][j], data[5][j], data[6][j], data[7][j]);
912 eff_1.v = _mm_add_ps(_mm_mul_ps(eff_1.v, x_1.v), ee_1.v);
913 eff_2.v = _mm_add_ps(_mm_mul_ps(eff_2.v, x_2.v), ee_2.v);
914 ee_1.v = _mm_add_ps(_mm_mul_ps(ee_1.v, x_1.v), c_1.v);
915 ee_2.v = _mm_add_ps(_mm_mul_ps(ee_2.v, x_2.v), c_2.v);
919 _mm_store_ps(&e[0], ee_1.v);
920 _mm_store_ps(&e[4], ee_2.v);
921 _mm_store_ps(&f[0], _mm_mul_ps(eff_1.v, _mm_div_ps(hi_1.v, r_1.v)));
922 _mm_store_ps(&f[4], _mm_mul_ps(eff_2.v, _mm_div_ps(hi_2.v, r_2.v)));
924 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE)
928 vector
unsigned int m;
931 } alpha0_1, alpha1_1, alpha2_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, ind_1,
932 alpha0_2, alpha1_2, alpha2_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2, ind_2;
936 r_1.v = vec_sqrt(*((vector
float *)&r2[0]));
937 r_2.v = vec_sqrt(*((vector
float *)&r2[4]));
940 alpha0_1.v = vec_load4(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
941 alpha1_1.v = vec_load4(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
942 alpha2_1.v = vec_load4(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
943 alpha0_2.v = vec_load4(p[4]->alpha[0], p[5]->alpha[0], p[6]->alpha[0], p[7]->alpha[0]);
944 alpha1_2.v = vec_load4(p[4]->alpha[1], p[5]->alpha[1], p[6]->alpha[1], p[7]->alpha[1]);
945 alpha2_2.v = vec_load4(p[4]->alpha[2], p[5]->alpha[2], p[6]->alpha[2], p[7]->alpha[2]);
946 ind_1.m = vec_ctu(vec_madd(r_1.v, vec_madd(r_1.v, alpha2_1.v, alpha1_1.v), alpha0_1.v), 0);
947 ind_2.m = vec_ctu(vec_madd(r_2.v, vec_madd(r_2.v, alpha2_2.v, alpha1_2.v), alpha0_2.v), 0);
950 data[0] = &(p[0]->
c[ ind_1.i[0] * potential_chunk ]);
951 data[1] = &(p[1]->
c[ ind_1.i[1] * potential_chunk ]);
952 data[2] = &(p[2]->
c[ ind_1.i[2] * potential_chunk ]);
953 data[3] = &(p[3]->
c[ ind_1.i[3] * potential_chunk ]);
954 data[4] = &(p[4]->
c[ ind_2.i[0] * potential_chunk ]);
955 data[5] = &(p[5]->
c[ ind_2.i[1] * potential_chunk ]);
956 data[6] = &(p[6]->
c[ ind_2.i[2] * potential_chunk ]);
957 data[7] = &(p[7]->
c[ ind_2.i[3] * potential_chunk ]);
960 mi_1.v = vec_load4(data[0][0], data[1][0], data[2][0], data[3][0]);
961 hi_1.v = vec_load4(data[0][1], data[1][1], data[2][1], data[3][1]);
962 mi_2.v = vec_load4(data[4][0], data[5][0], data[6][0], data[7][0]);
963 hi_2.v = vec_load4(data[4][1], data[5][1], data[6][1], data[7][1]);
964 x_1.v = vec_mul(vec_sub(r_1.v, mi_1.v), hi_1.v);
965 x_2.v = vec_mul(vec_sub(r_2.v, mi_2.v), hi_2.v);
968 eff_1.v = vec_load4(data[0][2], data[1][2], data[2][2], data[3][2]);
969 eff_2.v = vec_load4(data[4][2], data[5][2], data[6][2], data[7][2]);
970 c_1.v = vec_load4(data[0][3], data[1][3], data[2][3], data[3][3]);
971 c_2.v = vec_load4(data[4][3], data[5][3], data[6][3], data[7][3]);
972 ee_1.v = vec_madd(eff_1.v, x_1.v, c_1.v);
973 ee_2.v = vec_madd(eff_2.v, x_2.v, c_2.v);
974 for(j = 4 ; j < potential_chunk ; j++) {
975 c_1.v = vec_load4(data[0][j], data[1][j], data[2][j], data[3][j]);
976 c_2.v = vec_load4(data[4][j], data[5][j], data[6][j], data[7][j]);
977 eff_1.v = vec_madd(eff_1.v, x_1.v, ee_1.v);
978 eff_2.v = vec_madd(eff_2.v, x_2.v, ee_2.v);
979 ee_1.v = vec_madd(ee_1.v, x_1.v, c_1.v);
980 ee_2.v = vec_madd(ee_2.v, x_2.v, c_2.v);
984 eff_1.v = vec_mul(eff_1.v, vec_div(hi_1.v, r_1.v));
985 eff_2.v = vec_mul(eff_2.v, vec_div(hi_2.v, r_2.v));
986 memcpy(&e[0], &ee_1,
sizeof(vector
float));
987 memcpy(&f[0], &eff_1,
sizeof(vector
float));
988 memcpy(&e[4], &ee_2,
sizeof(vector
float));
989 memcpy(&f[4], &eff_2,
sizeof(vector
float));
994 for(k = 0 ; k < 8 ; k++) {
996 e[k] = ee; f[k] = eff;
1003 TF_ALWAYS_INLINE
void potential_eval_vec_8single_r(
struct Potential *p[8],
float *r2,
float *e,
float *f) {
1005 #if defined(__AVX__) && defined(FPTYPE_SINGLE)
1012 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r, ind;
1016 r.v = _mm256_load_ps(r2);
1019 alpha0.v = _mm256_setr_ps(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0], p[4]->alpha[0], p[5]->alpha[0], p[6]->alpha[0], p[7]->alpha[0]);
1020 alpha1.v = _mm256_setr_ps(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1], p[4]->alpha[1], p[5]->alpha[1], p[6]->alpha[1], p[7]->alpha[1]);
1021 alpha2.v = _mm256_setr_ps(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2], p[4]->alpha[2], p[5]->alpha[2], p[6]->alpha[2], p[7]->alpha[2]);
1022 ind.m = _mm256_cvttps_epi32(_mm256_max_ps(_mm256_setzero_ps(), _mm256_add_ps(alpha0.v, _mm256_mul_ps(r.v, _mm256_add_ps(alpha1.v, _mm256_mul_ps(r.v, alpha2.v))))));
1025 data[0] = &(p[0]->c[ ind.i[0] * potential_chunk ]);
1026 data[1] = &(p[1]->c[ ind.i[1] * potential_chunk ]);
1027 data[2] = &(p[2]->c[ ind.i[2] * potential_chunk ]);
1028 data[3] = &(p[3]->c[ ind.i[3] * potential_chunk ]);
1029 data[4] = &(p[4]->c[ ind.i[4] * potential_chunk ]);
1030 data[5] = &(p[5]->c[ ind.i[5] * potential_chunk ]);
1031 data[6] = &(p[6]->c[ ind.i[6] * potential_chunk ]);
1032 data[7] = &(p[7]->c[ ind.i[7] * potential_chunk ]);
1035 mi.v = _mm256_setr_ps(data[0][0], data[1][0], data[2][0], data[3][0], data[4][0], data[5][0], data[6][0], data[7][0]);
1036 hi.v = _mm256_setr_ps(data[0][1], data[1][1], data[2][1], data[3][1], data[4][1], data[5][1], data[6][1], data[7][1]);
1037 x.v = _mm256_mul_ps(_mm256_sub_ps(r.v, mi.v), hi.v);
1040 eff.v = _mm256_setr_ps(data[0][2], data[1][2], data[2][2], data[3][2], data[4][2], data[5][2], data[6][2], data[7][2]);
1041 c.v = _mm256_setr_ps(data[0][3], data[1][3], data[2][3], data[3][3], data[4][3], data[5][3], data[6][3], data[7][3]);
1042 ee.v = _mm256_add_ps(_mm256_mul_ps(eff.v, x.v), c.v);
1043 for(j = 4 ; j < potential_chunk ; j++) {
1044 c.v = _mm256_setr_ps(data[0][j], data[1][j], data[2][j], data[3][j], data[4][j], data[5][j], data[6][j], data[7][j]);
1045 eff.v = _mm256_add_ps(_mm256_mul_ps(eff.v, x.v), ee.v);
1046 ee.v = _mm256_add_ps(_mm256_mul_ps(ee.v, x.v), c.v);
1050 _mm256_store_ps(e, ee.v);
1051 _mm256_store_ps(f, _mm256_mul_ps(eff.v, hi.v));
1053 #elif defined(__SSE__) && defined(FPTYPE_SINGLE)
1060 } alpha0_1, alpha1_1, alpha2_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, ind_1,
1061 alpha0_2, alpha1_2, alpha2_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2, ind_2;
1065 r_1.v = _mm_load_ps(&r2[0]);
1066 r_2.v = _mm_load_ps(&r2[4]);
1069 alpha0_1.v = _mm_setr_ps(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
1070 alpha1_1.v = _mm_setr_ps(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
1071 alpha2_1.v = _mm_setr_ps(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
1072 alpha0_2.v = _mm_setr_ps(p[4]->alpha[0], p[5]->alpha[0], p[6]->alpha[0], p[7]->alpha[0]);
1073 alpha1_2.v = _mm_setr_ps(p[4]->alpha[1], p[5]->alpha[1], p[6]->alpha[1], p[7]->alpha[1]);
1074 alpha2_2.v = _mm_setr_ps(p[4]->alpha[2], p[5]->alpha[2], p[6]->alpha[2], p[7]->alpha[2]);
1075 ind_1.m = _mm_cvttps_epi32(_mm_max_ps(_mm_setzero_ps(), _mm_add_ps(alpha0_1.v, _mm_mul_ps(r_1.v, _mm_add_ps(alpha1_1.v, _mm_mul_ps(r_1.v, alpha2_1.v))))));
1076 ind_2.m = _mm_cvttps_epi32(_mm_max_ps(_mm_setzero_ps(), _mm_add_ps(alpha0_2.v, _mm_mul_ps(r_2.v, _mm_add_ps(alpha1_2.v, _mm_mul_ps(r_2.v, alpha2_2.v))))));
1079 data[0] = &(p[0]->c[ ind_1.i[0] * potential_chunk ]);
1080 data[1] = &(p[1]->c[ ind_1.i[1] * potential_chunk ]);
1081 data[2] = &(p[2]->c[ ind_1.i[2] * potential_chunk ]);
1082 data[3] = &(p[3]->c[ ind_1.i[3] * potential_chunk ]);
1083 data[4] = &(p[4]->c[ ind_2.i[0] * potential_chunk ]);
1084 data[5] = &(p[5]->c[ ind_2.i[1] * potential_chunk ]);
1085 data[6] = &(p[6]->c[ ind_2.i[2] * potential_chunk ]);
1086 data[7] = &(p[7]->c[ ind_2.i[3] * potential_chunk ]);
1089 mi_1.v = _mm_setr_ps(data[0][0], data[1][0], data[2][0], data[3][0]);
1090 hi_1.v = _mm_setr_ps(data[0][1], data[1][1], data[2][1], data[3][1]);
1091 mi_2.v = _mm_setr_ps(data[4][0], data[5][0], data[6][0], data[7][0]);
1092 hi_2.v = _mm_setr_ps(data[4][1], data[5][1], data[6][1], data[7][1]);
1093 x_1.v = _mm_mul_ps(_mm_sub_ps(r_1.v, mi_1.v), hi_1.v);
1094 x_2.v = _mm_mul_ps(_mm_sub_ps(r_2.v, mi_2.v), hi_2.v);
1097 eff_1.v = _mm_setr_ps(data[0][2], data[1][2], data[2][2], data[3][2]);
1098 eff_2.v = _mm_setr_ps(data[4][2], data[5][2], data[6][2], data[7][2]);
1099 c_1.v = _mm_setr_ps(data[0][3], data[1][3], data[2][3], data[3][3]);
1100 c_2.v = _mm_setr_ps(data[4][3], data[5][3], data[6][3], data[7][3]);
1101 ee_1.v = _mm_add_ps(_mm_mul_ps(eff_1.v, x_1.v), c_1.v);
1102 ee_2.v = _mm_add_ps(_mm_mul_ps(eff_2.v, x_2.v), c_2.v);
1103 for(j = 4 ; j < potential_chunk ; j++) {
1104 c_1.v = _mm_setr_ps(data[0][j], data[1][j], data[2][j], data[3][j]);
1105 c_2.v = _mm_setr_ps(data[4][j], data[5][j], data[6][j], data[7][j]);
1106 eff_1.v = _mm_add_ps(_mm_mul_ps(eff_1.v, x_1.v), ee_1.v);
1107 eff_2.v = _mm_add_ps(_mm_mul_ps(eff_2.v, x_2.v), ee_2.v);
1108 ee_1.v = _mm_add_ps(_mm_mul_ps(ee_1.v, x_1.v), c_1.v);
1109 ee_2.v = _mm_add_ps(_mm_mul_ps(ee_2.v, x_2.v), c_2.v);
1113 _mm_store_ps(&e[0], ee_1.v);
1114 _mm_store_ps(&e[4], ee_2.v);
1115 _mm_store_ps(&f[0], _mm_mul_ps(eff_1.v, hi_1.v));
1116 _mm_store_ps(&f[4], _mm_mul_ps(eff_2.v, hi_2.v));
1118 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE)
1122 vector
unsigned int m;
1125 } alpha0_1, alpha1_1, alpha2_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, ind_1,
1126 alpha0_2, alpha1_2, alpha2_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2, ind_2;
1130 r_1.v = *((vector
float *)&r2[0]);
1131 r_2.v = *((vector
float *)&r2[4]);
1134 alpha0_1.v = vec_load4(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
1135 alpha1_1.v = vec_load4(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
1136 alpha2_1.v = vec_load4(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
1137 alpha0_2.v = vec_load4(p[4]->alpha[0], p[5]->alpha[0], p[6]->alpha[0], p[7]->alpha[0]);
1138 alpha1_2.v = vec_load4(p[4]->alpha[1], p[5]->alpha[1], p[6]->alpha[1], p[7]->alpha[1]);
1139 alpha2_2.v = vec_load4(p[4]->alpha[2], p[5]->alpha[2], p[6]->alpha[2], p[7]->alpha[2]);
1140 ind_1.m = vec_ctu(vec_madd(r_1.v, vec_madd(r_1.v, alpha2_1.v, alpha1_1.v), alpha0_1.v), 0);
1141 ind_2.m = vec_ctu(vec_madd(r_2.v, vec_madd(r_2.v, alpha2_2.v, alpha1_2.v), alpha0_2.v), 0);
1144 data[0] = &(p[0]->c[ ind_1.i[0] * potential_chunk ]);
1145 data[1] = &(p[1]->c[ ind_1.i[1] * potential_chunk ]);
1146 data[2] = &(p[2]->c[ ind_1.i[2] * potential_chunk ]);
1147 data[3] = &(p[3]->c[ ind_1.i[3] * potential_chunk ]);
1148 data[4] = &(p[4]->c[ ind_2.i[0] * potential_chunk ]);
1149 data[5] = &(p[5]->c[ ind_2.i[1] * potential_chunk ]);
1150 data[6] = &(p[6]->c[ ind_2.i[2] * potential_chunk ]);
1151 data[7] = &(p[7]->c[ ind_2.i[3] * potential_chunk ]);
1154 mi_1.v = vec_load4(data[0][0], data[1][0], data[2][0], data[3][0]);
1155 hi_1.v = vec_load4(data[0][1], data[1][1], data[2][1], data[3][1]);
1156 mi_2.v = vec_load4(data[4][0], data[5][0], data[6][0], data[7][0]);
1157 hi_2.v = vec_load4(data[4][1], data[5][1], data[6][1], data[7][1]);
1158 x_1.v = vec_mul(vec_sub(r_1.v, mi_1.v), hi_1.v);
1159 x_2.v = vec_mul(vec_sub(r_2.v, mi_2.v), hi_2.v);
1162 eff_1.v = vec_load4(data[0][2], data[1][2], data[2][2], data[3][2]);
1163 eff_2.v = vec_load4(data[4][2], data[5][2], data[6][2], data[7][2]);
1164 c_1.v = vec_load4(data[0][3], data[1][3], data[2][3], data[3][3]);
1165 c_2.v = vec_load4(data[4][3], data[5][3], data[6][3], data[7][3]);
1166 ee_1.v = vec_madd(eff_1.v, x_1.v, c_1.v);
1167 ee_2.v = vec_madd(eff_2.v, x_2.v, c_2.v);
1168 for(j = 4 ; j < potential_chunk ; j++) {
1169 c_1.v = vec_load4(data[0][j], data[1][j], data[2][j], data[3][j]);
1170 c_2.v = vec_load4(data[4][j], data[5][j], data[6][j], data[7][j]);
1171 eff_1.v = vec_madd(eff_1.v, x_1.v, ee_1.v);
1172 eff_2.v = vec_madd(eff_2.v, x_2.v, ee_2.v);
1173 ee_1.v = vec_madd(ee_1.v, x_1.v, c_1.v);
1174 ee_2.v = vec_madd(ee_2.v, x_2.v, c_2.v);
1178 eff_1.v = vec_mul(eff_1.v, hi_1.v);
1179 eff_2.v = vec_mul(eff_2.v, hi_2.v);
1180 memcpy(&e[0], &ee_1,
sizeof(vector
float));
1181 memcpy(&f[0], &eff_1,
sizeof(vector
float));
1182 memcpy(&e[4], &ee_2,
sizeof(vector
float));
1183 memcpy(&f[4], &eff_2,
sizeof(vector
float));
1188 for(k = 0 ; k < 8 ; k++) {
1190 e[k] = ee; f[k] = eff;
1222 #if defined(__SSE2__) && defined(FPTYPE_DOUBLE)
1227 } alpha0, alpha1, alpha2, rind, mi, hi, x, ee, eff, c, r;
1231 r.v = _mm_sqrt_pd(_mm_load_pd(r2));
1234 alpha0.v = _mm_setr_pd(p[0]->alpha[0], p[1]->alpha[0]);
1235 alpha1.v = _mm_setr_pd(p[0]->alpha[1], p[1]->alpha[1]);
1236 alpha2.v = _mm_setr_pd(p[0]->alpha[2], p[1]->alpha[2]);
1237 rind.v = _mm_max_pd(_mm_setzero_pd(), _mm_add_pd(alpha0.v, _mm_mul_pd(r.v, _mm_add_pd(alpha1.v, _mm_mul_pd(r.v, alpha2.v)))));
1242 data[0] = &(p[0]->
c[ ind[0] * potential_chunk ]);
1243 data[1] = &(p[1]->
c[ ind[1] * potential_chunk ]);
1246 mi.v = _mm_setr_pd(data[0][0], data[1][0]);
1247 hi.v = _mm_setr_pd(data[0][1], data[1][1]);
1248 x.v = _mm_mul_pd(_mm_sub_pd(r.v, mi.v), hi.v);
1251 eff.v = _mm_setr_pd(data[0][2], data[1][2]);
1252 c.v = _mm_setr_pd(data[0][3], data[1][3]);
1253 ee.v = _mm_add_pd(_mm_mul_pd(eff.v, x.v), c.v);
1254 for(j = 4 ; j < potential_chunk ; j++) {
1255 c.v = _mm_setr_pd(data[0][j], data[1][j]);
1256 eff.v = _mm_add_pd(_mm_mul_pd(eff.v, x.v), ee.v);
1257 ee.v = _mm_add_pd(_mm_mul_pd(ee.v, x.v), c.v);
1261 _mm_store_pd(e, ee.v);
1262 _mm_store_pd(f, _mm_mul_pd(eff.v, _mm_div_pd(hi.v, r.v)));
1266 for(k = 0 ; k < 2 ; k++)
1298 #if defined(__AVX__) && defined(FPTYPE_DOUBLE)
1303 } alpha0, alpha1, alpha2, rind, mi, hi, x, ee, eff, c, r;
1307 r.v = _mm256_sqrt_pd(_mm256_load_pd(r2));
1310 alpha0.v = _mm256_setr_pd(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
1311 alpha1.v = _mm256_setr_pd(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
1312 alpha2.v = _mm256_setr_pd(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
1313 rind.v = _mm256_max_pd(_mm256_setzero_pd(), _mm256_add_pd(alpha0.v, _mm256_mul_pd(r.v, _mm256_add_pd(alpha1.v, _mm256_mul_pd(r.v, alpha2.v)))));
1320 data[0] = &(p[0]->
c[ ind[0] * potential_chunk ]);
1321 data[1] = &(p[1]->
c[ ind[1] * potential_chunk ]);
1322 data[2] = &(p[2]->
c[ ind[2] * potential_chunk ]);
1323 data[3] = &(p[3]->
c[ ind[3] * potential_chunk ]);
1326 mi.v = _mm256_setr_pd(data[0][0], data[1][0], data[2][0], data[3][0]);
1327 hi.v = _mm256_setr_pd(data[0][1], data[1][1], data[2][1], data[3][1]);
1328 x.v = _mm256_mul_pd(_mm256_sub_pd(r.v, mi.v), hi.v);
1331 eff.v = _mm256_setr_pd(data[0][2], data[1][2], data[2][2], data[3][2]);
1332 c.v = _mm256_setr_pd(data[0][3], data[1][3], data[2][3], data[3][3]);
1333 ee.v = _mm256_add_pd(_mm256_mul_pd(eff.v, x.v), c.v);
1334 for(j = 4 ; j < potential_chunk ; j++) {
1335 c.v = _mm256_setr_pd(data[0][j], data[1][j], data[2][j], data[3][j]);
1336 eff.v = _mm256_add_pd(_mm256_mul_pd(eff.v, x.v), ee.v);
1337 ee.v = _mm256_add_pd(_mm256_mul_pd(ee.v, x.v), c.v);
1341 _mm256_store_pd(e, ee.v);
1342 _mm256_store_pd(f, _mm256_mul_pd(eff.v, _mm256_div_pd(hi.v, r.v)));
1344 #elif defined(__SSE2__) && defined(FPTYPE_DOUBLE)
1349 } alpha0_1, alpha1_1, alpha2_1, rind_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1,
1350 alpha0_2, alpha1_2, alpha2_2, rind_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2;
1354 r_1.v = _mm_sqrt_pd(_mm_load_pd(&r2[0]));
1355 r_2.v = _mm_sqrt_pd(_mm_load_pd(&r2[2]));
1358 alpha0_1.v = _mm_setr_pd(p[0]->alpha[0], p[1]->alpha[0]);
1359 alpha1_1.v = _mm_setr_pd(p[0]->alpha[1], p[1]->alpha[1]);
1360 alpha2_1.v = _mm_setr_pd(p[0]->alpha[2], p[1]->alpha[2]);
1361 alpha0_2.v = _mm_setr_pd(p[2]->alpha[0], p[3]->alpha[0]);
1362 alpha1_2.v = _mm_setr_pd(p[2]->alpha[1], p[3]->alpha[1]);
1363 alpha2_2.v = _mm_setr_pd(p[2]->alpha[2], p[3]->alpha[2]);
1364 rind_1.v = _mm_max_pd(_mm_setzero_pd(), _mm_add_pd(alpha0_1.v, _mm_mul_pd(r_1.v, _mm_add_pd(alpha1_1.v, _mm_mul_pd(r_1.v, alpha2_1.v)))));
1365 rind_2.v = _mm_max_pd(_mm_setzero_pd(), _mm_add_pd(alpha0_2.v, _mm_mul_pd(r_2.v, _mm_add_pd(alpha1_2.v, _mm_mul_pd(r_2.v, alpha2_2.v)))));
1366 ind[0] = rind_1.f[0];
1367 ind[1] = rind_1.f[1];
1368 ind[2] = rind_2.f[0];
1369 ind[3] = rind_2.f[1];
1372 data[0] = &(p[0]->
c[ ind[0] * potential_chunk ]);
1373 data[1] = &(p[1]->
c[ ind[1] * potential_chunk ]);
1374 data[2] = &(p[2]->
c[ ind[2] * potential_chunk ]);
1375 data[3] = &(p[3]->
c[ ind[3] * potential_chunk ]);
1378 mi_1.v = _mm_setr_pd(data[0][0], data[1][0]);
1379 hi_1.v = _mm_setr_pd(data[0][1], data[1][1]);
1380 mi_2.v = _mm_setr_pd(data[2][0], data[3][0]);
1381 hi_2.v = _mm_setr_pd(data[2][1], data[3][1]);
1382 x_1.v = _mm_mul_pd(_mm_sub_pd(r_1.v, mi_1.v), hi_1.v);
1383 x_2.v = _mm_mul_pd(_mm_sub_pd(r_2.v, mi_2.v), hi_2.v);
1386 eff_1.v = _mm_setr_pd(data[0][2], data[1][2]);
1387 eff_2.v = _mm_setr_pd(data[2][2], data[3][2]);
1388 c_1.v = _mm_setr_pd(data[0][3], data[1][3]);
1389 c_2.v = _mm_setr_pd(data[2][3], data[3][3]);
1390 ee_1.v = _mm_add_pd(_mm_mul_pd(eff_1.v, x_1.v), c_1.v);
1391 ee_2.v = _mm_add_pd(_mm_mul_pd(eff_2.v, x_2.v), c_2.v);
1392 for(j = 4 ; j < potential_chunk ; j++) {
1393 c_1.v = _mm_setr_pd(data[0][j], data[1][j]);
1394 c_2.v = _mm_setr_pd(data[2][j], data[3][j]);
1395 eff_1.v = _mm_add_pd(_mm_mul_pd(eff_1.v, x_1.v), ee_1.v);
1396 eff_2.v = _mm_add_pd(_mm_mul_pd(eff_2.v, x_2.v), ee_2.v);
1397 ee_1.v = _mm_add_pd(_mm_mul_pd(ee_1.v, x_1.v), c_1.v);
1398 ee_2.v = _mm_add_pd(_mm_mul_pd(ee_2.v, x_2.v), c_2.v);
1402 _mm_store_pd(&e[0], ee_1.v);
1403 _mm_store_pd(&f[0], _mm_mul_pd(eff_1.v, _mm_div_pd(hi_1.v, r_1.v)));
1404 _mm_store_pd(&e[2], ee_2.v);
1405 _mm_store_pd(&f[2], _mm_mul_pd(eff_2.v, _mm_div_pd(hi_2.v, r_2.v)));
1409 for(k = 0 ; k < 4 ; k++)
1438 #if defined(__AVX__) && defined(FPTYPE_DOUBLE)
1443 } alpha0, alpha1, alpha2, rind, mi, hi, x, ee, eff, c, r;
1447 r.v = _mm256_load_pd(r_in);
1450 alpha0.v = _mm256_setr_pd(p[0]->alpha[0], p[1]->alpha[0], p[2]->alpha[0], p[3]->alpha[0]);
1451 alpha1.v = _mm256_setr_pd(p[0]->alpha[1], p[1]->alpha[1], p[2]->alpha[1], p[3]->alpha[1]);
1452 alpha2.v = _mm256_setr_pd(p[0]->alpha[2], p[1]->alpha[2], p[2]->alpha[2], p[3]->alpha[2]);
1453 rind.v = _mm256_max_pd(_mm256_setzero_pd(), _mm256_add_pd(alpha0.v, _mm256_mul_pd(r.v, _mm256_add_pd(alpha1.v, _mm256_mul_pd(r.v, alpha2.v)))));
1460 data[0] = &(p[0]->
c[ ind[0] * potential_chunk ]);
1461 data[1] = &(p[1]->
c[ ind[1] * potential_chunk ]);
1462 data[2] = &(p[2]->
c[ ind[2] * potential_chunk ]);
1463 data[3] = &(p[3]->
c[ ind[3] * potential_chunk ]);
1466 mi.v = _mm256_setr_pd(data[0][0], data[1][0], data[2][0], data[3][0]);
1467 hi.v = _mm256_setr_pd(data[0][1], data[1][1], data[2][1], data[3][1]);
1468 x.v = _mm256_mul_pd(_mm256_sub_pd(r.v, mi.v), hi.v);
1471 eff.v = _mm256_setr_pd(data[0][2], data[1][2], data[2][2], data[3][2]);
1472 c.v = _mm256_setr_pd(data[0][3], data[1][3], data[2][3], data[3][3]);
1473 ee.v = _mm256_add_pd(_mm256_mul_pd(eff.v, x.v), c.v);
1474 for(j = 4 ; j < potential_chunk ; j++) {
1475 c.v = _mm256_setr_pd(data[0][j], data[1][j], data[2][j], data[3][j]);
1476 eff.v = _mm256_add_pd(_mm256_mul_pd(eff.v, x.v), ee.v);
1477 ee.v = _mm256_add_pd(_mm256_mul_pd(ee.v, x.v), c.v);
1481 _mm256_store_pd(e, ee.v);
1482 _mm256_store_pd(f, _mm256_mul_pd(eff.v, hi.v));
1484 #elif defined(__SSE2__) && defined(FPTYPE_DOUBLE)
1489 } alpha0_1, alpha1_1, alpha2_1, rind_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1,
1490 alpha0_2, alpha1_2, alpha2_2, rind_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2;
1494 r_1.v = _mm_load_pd(&r_in[0]);
1495 r_2.v = _mm_load_pd(&r_in[2]);
1498 alpha0_1.v = _mm_setr_pd(p[0]->alpha[0], p[1]->alpha[0]);
1499 alpha1_1.v = _mm_setr_pd(p[0]->alpha[1], p[1]->alpha[1]);
1500 alpha2_1.v = _mm_setr_pd(p[0]->alpha[2], p[1]->alpha[2]);
1501 alpha0_2.v = _mm_setr_pd(p[2]->alpha[0], p[3]->alpha[0]);
1502 alpha1_2.v = _mm_setr_pd(p[2]->alpha[1], p[3]->alpha[1]);
1503 alpha2_2.v = _mm_setr_pd(p[2]->alpha[2], p[3]->alpha[2]);
1504 rind_1.v = _mm_max_pd(_mm_setzero_pd(), _mm_add_pd(alpha0_1.v, _mm_mul_pd(r_1.v, _mm_add_pd(alpha1_1.v, _mm_mul_pd(r_1.v, alpha2_1.v)))));
1505 rind_2.v = _mm_max_pd(_mm_setzero_pd(), _mm_add_pd(alpha0_2.v, _mm_mul_pd(r_2.v, _mm_add_pd(alpha1_2.v, _mm_mul_pd(r_2.v, alpha2_2.v)))));
1506 ind[0] = rind_1.f[0];
1507 ind[1] = rind_1.f[1];
1508 ind[2] = rind_2.f[0];
1509 ind[3] = rind_2.f[1];
1512 data[0] = &(p[0]->
c[ ind[0] * potential_chunk ]);
1513 data[1] = &(p[1]->
c[ ind[1] * potential_chunk ]);
1514 data[2] = &(p[2]->
c[ ind[2] * potential_chunk ]);
1515 data[3] = &(p[3]->
c[ ind[3] * potential_chunk ]);
1518 mi_1.v = _mm_setr_pd(data[0][0], data[1][0]);
1519 hi_1.v = _mm_setr_pd(data[0][1], data[1][1]);
1520 mi_2.v = _mm_setr_pd(data[2][0], data[3][0]);
1521 hi_2.v = _mm_setr_pd(data[2][1], data[3][1]);
1522 x_1.v = _mm_mul_pd(_mm_sub_pd(r_1.v, mi_1.v), hi_1.v);
1523 x_2.v = _mm_mul_pd(_mm_sub_pd(r_2.v, mi_2.v), hi_2.v);
1526 eff_1.v = _mm_setr_pd(data[0][2], data[1][2]);
1527 eff_2.v = _mm_setr_pd(data[2][2], data[3][2]);
1528 c_1.v = _mm_setr_pd(data[0][3], data[1][3]);
1529 c_2.v = _mm_setr_pd(data[2][3], data[3][3]);
1530 ee_1.v = _mm_add_pd(_mm_mul_pd(eff_1.v, x_1.v), c_1.v);
1531 ee_2.v = _mm_add_pd(_mm_mul_pd(eff_2.v, x_2.v), c_2.v);
1532 for(j = 4 ; j < potential_chunk ; j++) {
1533 c_1.v = _mm_setr_pd(data[0][j], data[1][j]);
1534 c_2.v = _mm_setr_pd(data[2][j], data[3][j]);
1535 eff_1.v = _mm_add_pd(_mm_mul_pd(eff_1.v, x_1.v), ee_1.v);
1536 eff_2.v = _mm_add_pd(_mm_mul_pd(eff_2.v, x_2.v), ee_2.v);
1537 ee_1.v = _mm_add_pd(_mm_mul_pd(ee_1.v, x_1.v), c_1.v);
1538 ee_2.v = _mm_add_pd(_mm_mul_pd(ee_2.v, x_2.v), c_2.v);
1542 _mm_store_pd(&e[0], ee_1.v);
1543 _mm_store_pd(&f[0], _mm_mul_pd(eff_1.v, hi_1.v));
1544 _mm_store_pd(&e[2], ee_2.v);
1545 _mm_store_pd(&f[2], _mm_mul_pd(eff_2.v, hi_2.v));
1549 for(k = 0 ; k < 4 ; k++)
1555 TF_ALWAYS_INLINE Potential *get_potential(
const Particle *a,
const Particle *b) {
1557 if ((a->flags & b->flags & PARTICLE_BOUND) && (a->clusterId == b->clusterId)) {
1558 return _Engine.p_cluster[index];
Include Python header, disable linking to pythonX_d.lib on Windows in debug mode.
Definition tfAngleConfig.h:26
TF_ALWAYS_INLINE void potential_eval_vec_4single_r(struct Potential *p[4], float *r_in, float *e, float *f)
Evaluates the given potential at a set of points (interpolated).
Definition tf_potential_eval.h:686
TF_ALWAYS_INLINE FPTYPE fptype_r2(FPTYPE *x1, FPTYPE *x2, FPTYPE *dx)
Inlined function to compute the distance^2 between two vectors.
Definition tf_fptype.h:87
TF_ALWAYS_INLINE void potential_eval_expl(struct Potential *p, FPTYPE r2, FPTYPE *e, FPTYPE *f)
Evaluates the given potential at the given radius explicitly.
Definition tf_potential_eval.h:296
FPTYPE space_cell_gaussian(int cell_id)
struct TissueForge::space_cell space_cell
the space_cell structure
TF_ALWAYS_INLINE void potential_eval_vec_2double(struct Potential *p[2], FPTYPE *r2, FPTYPE *e, FPTYPE *f)
Evaluates the given potential at a set of points (interpolated).
Definition tf_potential_eval.h:1220
TF_ALWAYS_INLINE void potential_eval(struct Potential *p, FPTYPE r2, FPTYPE *e, FPTYPE *f)
Evaluates the given potential at the given point (interpolated).
Definition tf_potential_eval.h:74
TF_ALWAYS_INLINE void potential_eval_r(struct Potential *p, FPTYPE r, FPTYPE *e, FPTYPE *f)
Evaluates the given potential at the given point (interpolated).
Definition tf_potential_eval.h:165
TF_ALWAYS_INLINE void potential_eval_vec_4double(struct Potential *p[4], FPTYPE *r2, FPTYPE *e, FPTYPE *f)
Evaluates the given potential at a set of points (interpolated).
Definition tf_potential_eval.h:1296
TF_ALWAYS_INLINE void potential_eval_vec_4double_r(struct Potential *p[4], FPTYPE *r_in, FPTYPE *e, FPTYPE *f)
Evaluates the given potential at a set of points (interpolated).
Definition tf_potential_eval.h:1436
@ POTENTIAL_SHIFTED
Definition tfPotential.h:92
@ POTENTIAL_SCALED
Definition tfPotential.h:87
@ POTENTIAL_PERIODIC
Definition tfPotential.h:104
TF_ALWAYS_INLINE void potential_eval_vec_4single(struct Potential *p[4], float *r2, float *e, float *f)
Evaluates the given potential at a set of points (interpolated).
Definition tf_potential_eval.h:367
TF_ALWAYS_INLINE void potential_eval_vec_4single_old(struct Potential *p[4], float *r2, float *e, float *f)
Evaluates the given potential at a set of points (interpolated).
Definition tf_potential_eval.h:506
TF_ALWAYS_INLINE void potential_eval_vec_8single(struct Potential *p[8], float *r2, float *e, float *f)
Evaluates the given potential at a set of points (interpolated).
Definition tf_potential_eval.h:809
A Potential object is a compiled interpolation of a given function. The Universe applies potentials t...
Definition tfPotential.h:213
uint32_t flags
Definition tfPotential.h:217
FPTYPE * c
Definition tfPotential.h:224
FPTYPE alpha[4]
Definition tfPotential.h:221
int n
Definition tfPotential.h:238
static const int max_type
Definition tfEngine.h:193
struct Potential ** p
Definition tfEngine.h:200