Tissue Forge C++ 0.2.1
Interactive, particle-based physics, chemistry and biology modeling and simulation environment
Loading...
Searching...
No Matches
tf_fptype.h
1/*******************************************************************************
2 * This file is part of mdcore.
3 * Copyright (c) 2010 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
4 * Copyright (c) 2017 Andy Somogyi (somogyie at indiana dot edu)
5 * Copyright (c) 2022-2024 T.J. Sego
6 *
7 * This program is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 *
20 ******************************************************************************/
21
22#ifndef _MDCORE_INCLUDE_TF_FPTYPE_H_
23#define _MDCORE_INCLUDE_TF_FPTYPE_H_
24
25#include <mdcore_config.h>
26
/*
 * SIMD configuration macros for single/double precision vector operations.
 *
 * Depending on the floating-point mode (FPTYPE_SINGLE or not) and on the
 * instruction sets advertised by the compiler, this selects:
 *   VEC_SINGLE / VEC_DOUBLE : element type used by vectorized code,
 *   VEC_SIZE                : value used as the vector element count,
 *   VEC_ALIGN               : alignment in bytes,
 *   VECTORIZE               : marker that a vector configuration was found.
 *
 * NOTE(review): VEC_SIZE is 4 for the SSE2 double-precision case although a
 * 128-bit register holds two doubles -- confirm that this is intended.
 */
#if defined(FPTYPE_SINGLE) && defined(__AVX__)
    #define VEC_SINGLE
    #define VEC_SIZE 8
    #define VEC_ALIGN 32
    #define VECTORIZE
#elif defined(FPTYPE_SINGLE) && ( defined(__SSE__) || defined(__ALTIVEC__) )
    #define VEC_SINGLE
    #define VEC_SIZE 4
    #define VEC_ALIGN 16
    #define VECTORIZE
#elif !defined(FPTYPE_SINGLE) && defined(__AVX__)
    #define VEC_DOUBLE
    #define VEC_SIZE 4
    #define VEC_ALIGN 32
    #define VECTORIZE
#elif !defined(FPTYPE_SINGLE) && defined(__SSE2__)
    #define VEC_DOUBLE
    #define VEC_SIZE 4
    #define VEC_ALIGN 16
    #define VECTORIZE
#endif

/* Vectorized code paths are switched off unconditionally: whatever was
   selected above, VECTORIZE is not defined past this point. */
#undef VECTORIZE
55
56#if defined(__x86_64__) || defined(_M_X64)
57#include <emmintrin.h> //for Intel __m128i datatype
58#include <immintrin.h> //for Intel __m256i datatype
59#endif
60
61#if defined(__ARM_NEON)
62#include <arm_neon.h> //For ARM uint64x2_t datatype
63#endif
64
65
66
/**
 * Spell a compiler vector-extension type.
 *
 * Expands to `elem_type` decorated with the `vector_size` attribute, sized to
 * hold `count` elements of `elem_type`. Relies on the `__attribute__` syntax
 * (GCC/Clang-style compilers).
 *
 * Example: `simd_vector(4, float) v;`
 */
#define simd_vector(count, elem_type) \
    __attribute__((vector_size((count) * sizeof(elem_type)))) elem_type
69
70
71namespace TissueForge {
72
73
87 TF_ALWAYS_INLINE FPTYPE fptype_r2 ( FPTYPE *x1 , FPTYPE *x2 , FPTYPE *dx ) {
88
89 #if defined(VECTORIZE) && defined(FPTYPE_SINGLE) && defined(__SSE4_1__)
90 union {
91 simd_vector(4,float) v;
92 float f[4];
93 } a, b, c, d;
94
95 /* Load x1 and x2 into a and b. */
96 a.v = _mm_load_ps( x1 );
97 b.v = _mm_load_ps( x2 );
98
99 /* Compute the difference and store in dx. */
100 c.v = a.v - b.v;
101 _mm_store_ps( dx , c.v );
102
103 /* Use the built-in dot-product instruction. */
104 d.v = _mm_dp_ps( c.v , c.v , 0x71 );
105
106 /* Return the sum of squares. */
107 return d.f[0];
108 #elif defined(VECTORIZE) && defined(FPTYPE_SINGLE) && defined(__SSE3__)
109 union {
110 simd_vector(4,float) v;
111 float f[4];
112 } a, b, c, d;
113
114 /* Load x1 and x2 into a and b. */
115 a.v = _mm_load_ps( x1 );
116 b.v = _mm_load_ps( x2 );
117
118 /* Compute the difference and store in dx. */
119 c.v = a.v - b.v;
120 _mm_store_ps( dx , c.v );
121
122 /* Square the entries (use a different register so that c can be stored). */
123 d.v = c.v * c.v;
124
125 /* Add horizontally twice to get the sum of the four entries
126 in the lowest float. */
127 d.v = _mm_hadd_ps( d.v , d.v );
128 d.v = _mm_hadd_ps( d.v , d.v );
129
130 /* Return the sum of squares. */
131 return d.f[0];
132 #elif defined(VECTORIZE) && defined(FPTYPE_DOUBLE) && defined(__AVX__)
133 union {
134 __m256d v;
135 double f[4];
136 } a, b, c, d;
137
138 /* Load x1 and x2 into a and b. */
139 a.v = _mm256_load_pd( x1 );
140 b.v = _mm256_load_pd( x2 );
141
142 /* Compute the difference and store in dx. */
143 c.v = a.v - b.v;
144 _mm256_store_pd( dx , c.v );
145
146 /* Square the entries (use a different register so that c can be stored). */
147 d.v = c.v * c.v;
148
149 /* Add horizontally twice to get the sum of the four entries
150 in the lowest double. */
151 d.v = _mm256_hadd_pd( d.v , d.v );
152
153 /* Return the sum of squares. */
154 return d.f[0] + d.f[2];
155 #elif defined(VECTORIZE) && defined(FPTYPE_DOUBLE) && defined(__SSE4_1__)
156 union {
157 simd_vector(2,double) v;
158 double f[2];
159 } a1, a2, b1, b2, c1, c2, d1;
160
161 /* Load x1 and x2 into a and b. */
162 a1.v = _mm_load_pd( x1 );
163 b1.v = _mm_load_pd( x2 );
164 a2.v = _mm_load_pd( &x1[2] );
165 b2.v = _mm_load_pd( &x2[2] );
166
167 /* Compute the difference and store in dx. */
168 c1.v = a1.v - b1.v;
169 c2.v = a2.v - b2.v;
170 _mm_store_pd( dx , c1.v );
171 _mm_store_pd( &dx[2] , c2.v );
172
173 /* Use the built-in dot-product instruction. */
174 d1.v = _mm_dp_pd( c1.v , c1.v , 0x31 ) + c2.v * c2.v;
175
176 /* Return the sum of squares. */
177 return d1.f[0];
178 #elif defined(VECTORIZE) && defined(FPTYPE_DOUBLE) && defined(__SSE3__)
179 union {
180 simd_vector(2,double) v;
181 double f[2];
182 } a1, a2, b1, b2, c1, c2, d1, d2;
183
184 /* Load x1 and x2 into a and b. */
185 a1.v = _mm_load_pd( x1 );
186 b1.v = _mm_load_pd( x2 );
187 a2.v = _mm_load_pd( &x1[2] );
188 b2.v = _mm_load_pd( &x2[2] );
189
190 /* Compute the difference and store in dx. */
191 c1.v = a1.v - b1.v;
192 c2.v = a2.v - b2.v;
193 _mm_store_pd( dx , c1.v );
194 _mm_store_pd( &dx[2] , c2.v );
195
196 /* Square the entries (use a different register so that c can be stored). */
197 d1.v = c1.v * c1.v;
198 d2.v = c2.v * c2.v;
199
200 /* Add horizontally twice to get the sum of the four entries
201 in the lowest double. */
202 d1.v = _mm_hadd_pd( d1.v , d2.v );
203 d1.v = _mm_hadd_pd( d1.v , d1.v );
204
205 /* Return the sum of squares. */
206 return d1.f[0];
207 #else
208 dx[0] = x1[0] - x2[0];
209 dx[1] = x1[1] - x2[1];
210 dx[2] = x1[2] - x2[2];
211 return dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2];
212 #endif
213
214 }
215
216};
217
218#endif // _MDCORE_INCLUDE_TF_FPTYPE_H_
Include Python header, disable linking to pythonX_d.lib on Windows in debug mode.
Definition tfAngleConfig.h:26
TF_ALWAYS_INLINE FPTYPE fptype_r2(FPTYPE *x1, FPTYPE *x2, FPTYPE *dx)
Inlined function to compute the distance^2 between two vectors.
Definition tf_fptype.h:87