1 |
/* $Id$ */ |
2 |
|
3 |
/******************************************************* |
4 |
* |
5 |
* Copyright 2003-2007 by ACceSS MNRF |
6 |
* Copyright 2007 by University of Queensland |
7 |
* |
8 |
* http://esscc.uq.edu.au |
9 |
* Primary Business: Queensland, Australia |
10 |
* Licensed under the Open Software License version 3.0 |
11 |
* http://www.opensource.org/licenses/osl-3.0.php |
12 |
* |
13 |
*******************************************************/ |
14 |
|
15 |
/**************************************************************/ |
16 |
|
17 |
/* Some utility routines: */ |
18 |
|
19 |
/**************************************************************/ |
20 |
|
21 |
/* Copyrights by ACcESS Australia, 2003,2004,2005 */ |
22 |
|
23 |
/**************************************************************/ |
24 |
|
25 |
#include "Common.h" |
26 |
#include "PasoUtil.h" |
27 |
#ifdef _OPENMP |
28 |
#include <omp.h> |
29 |
#endif |
30 |
|
31 |
/* returns true if array contains value */ |
32 |
bool_t Paso_Util_isAny(dim_t N,index_t* array,index_t value) { |
33 |
bool_t out=FALSE; |
34 |
dim_t i; |
35 |
#pragma omp parallel for private(i) schedule(static) reduction(||:out) |
36 |
for (i=0;i<N;i++) { |
37 |
out = out || (array[i]==value); |
38 |
} |
39 |
return out; |
40 |
} |
41 |
|
42 |
/**************************************************************/ |
43 |
|
44 |
/* calculates the cummultative sum in array and returns the total sum */ |
45 |
|
46 |
/**************************************************************/ |
47 |
index_t Paso_Util_cumsum(dim_t N,index_t* array) { |
48 |
index_t out=0,tmp; |
49 |
dim_t i; |
50 |
#ifdef _OPENMP |
51 |
index_t *partial_sums=NULL,sum; |
52 |
partial_sums=TMPMEMALLOC(omp_get_max_threads(),index_t); |
53 |
#pragma omp parallel private(sum,i,tmp) |
54 |
{ |
55 |
sum=0; |
56 |
#pragma omp for schedule(static) |
57 |
for (i=0;i<N;++i) sum+=array[i]; |
58 |
|
59 |
partial_sums[omp_get_thread_num()]=sum; |
60 |
#pragma omp barrier |
61 |
#pragma omp single |
62 |
{ |
63 |
out=0; |
64 |
for (i=0;i<omp_get_max_threads();++i) { |
65 |
tmp=out; |
66 |
out+=partial_sums[i]; |
67 |
partial_sums[i]=tmp; |
68 |
} |
69 |
} |
70 |
sum=partial_sums[omp_get_thread_num()]; |
71 |
#pragma omp for schedule(static) |
72 |
for (i=0;i<N;++i) { |
73 |
tmp=sum; |
74 |
sum+=array[i]; |
75 |
array[i]=tmp; |
76 |
} |
77 |
} |
78 |
TMPMEMFREE(partial_sums); |
79 |
#else |
80 |
for (i=0;i<N;++i) { |
81 |
tmp=out; |
82 |
out+=array[i]; |
83 |
array[i]=tmp; |
84 |
} |
85 |
#endif |
86 |
return out; |
87 |
} |
88 |
/* |
89 |
* set x to zero: |
90 |
*/ |
91 |
void Paso_zeroes(const dim_t n, double* x) |
92 |
{ |
93 |
dim_t i,local_n,rest,n_start,n_end,q; |
94 |
#ifdef _OPENMP |
95 |
const int num_threads=omp_get_max_threads(); |
96 |
#else |
97 |
const int num_threads=1; |
98 |
#endif |
99 |
|
100 |
#pragma omp parallel for private(i,local_n,rest,n_start,n_end,q) |
101 |
for (i=0;i<num_threads;++i) { |
102 |
local_n=n/num_threads; |
103 |
rest=n-local_n*num_threads; |
104 |
n_start=local_n*i+MIN(i,rest); |
105 |
n_end=local_n*(i+1)+MIN(i+1,rest); |
106 |
#pragma ivdep |
107 |
for (q=n_start;q<n_end;++q) x[q]=0; |
108 |
} |
109 |
|
110 |
} |
111 |
/* |
112 |
* performs an update of the form x=a*x+b*y where y and x are long vectors. |
113 |
* if b=0, y is not used. |
114 |
*/ |
115 |
void Paso_Update(const dim_t n, const double a, double* x, const double b, const double* y) |
116 |
{ |
117 |
dim_t i,local_n,rest,n_start,n_end,q; |
118 |
#ifdef _OPENMP |
119 |
const int num_threads=omp_get_max_threads(); |
120 |
#else |
121 |
const int num_threads=1; |
122 |
#endif |
123 |
|
124 |
#pragma omp parallel for private(i,local_n,rest,n_start,n_end,q) |
125 |
for (i=0;i<num_threads;++i) { |
126 |
local_n=n/num_threads; |
127 |
rest=n-local_n*num_threads; |
128 |
n_start=local_n*i+MIN(i,rest); |
129 |
n_end=local_n*(i+1)+MIN(i+1,rest); |
130 |
if (a==1.) { |
131 |
#pragma ivdep |
132 |
for (q=n_start;q<n_end;++q) { |
133 |
x[q]+=b*y[q]; |
134 |
} |
135 |
} else if ((a==1.) && (b==1)) { |
136 |
#pragma ivdep |
137 |
for (q=n_start;q<n_end;++q) { |
138 |
x[q]+=y[q]; |
139 |
} |
140 |
} else if ((ABS(a)==0) && (ABS(b)==0)) { |
141 |
#pragma ivdep |
142 |
for (q=n_start;q<n_end;++q) { |
143 |
x[q]=0.; |
144 |
} |
145 |
} else if ((ABS(a)==0) && (ABS(b)==1)) { |
146 |
#pragma ivdep |
147 |
for (q=n_start;q<n_end;++q) { |
148 |
x[q]=y[q]; |
149 |
} |
150 |
} else if (ABS(a)==0) { |
151 |
#pragma ivdep |
152 |
for (q=n_start;q<n_end;++q) { |
153 |
x[q]=b*y[q]; |
154 |
} |
155 |
} else { |
156 |
#pragma ivdep |
157 |
for (q=n_start;q<n_end;++q) { |
158 |
x[q]=a*x[q]+b*y[q]; |
159 |
} |
160 |
} |
161 |
} |
162 |
} |
163 |
void Paso_Copy(const dim_t n, double* out, const double* in) { |
164 |
Paso_LinearCombination(n,out, 1.,in,0., in); |
165 |
} |
166 |
/* |
167 |
* performs an update of the form z=a*x+b*y where y and x are long vectors. |
168 |
* if a=0, x is not used. |
169 |
* if b=0, y is not used. |
170 |
*/ |
171 |
void Paso_LinearCombination(const dim_t n, double*z, const double a,const double* x, const double b, const double* y) |
172 |
{ |
173 |
dim_t i,local_n,rest,n_start,n_end,q; |
174 |
#ifdef _OPENMP |
175 |
const int num_threads=omp_get_max_threads(); |
176 |
#else |
177 |
const int num_threads=1; |
178 |
#endif |
179 |
|
180 |
#pragma omp parallel for private(i,local_n,rest,n_start,n_end,q) |
181 |
for (i=0;i<num_threads;++i) { |
182 |
local_n=n/num_threads; |
183 |
rest=n-local_n*num_threads; |
184 |
n_start=local_n*i+MIN(i,rest); |
185 |
n_end=local_n*(i+1)+MIN(i+1,rest); |
186 |
if ((ABS(a)==0) && (ABS(b)==0)) { |
187 |
#pragma ivdep |
188 |
for (q=n_start;q<n_end;++q) { |
189 |
z[q]=0.; |
190 |
} |
191 |
} else if ((ABS(a)==0) && (ABS(b)>0.)) { |
192 |
#pragma ivdep |
193 |
for (q=n_start;q<n_end;++q) { |
194 |
z[q]=b*y[q]; |
195 |
} |
196 |
} else if ((ABS(a)>0) && (ABS(b)==0.)) { |
197 |
#pragma ivdep |
198 |
for (q=n_start;q<n_end;++q) { |
199 |
z[q]=a*x[q]; |
200 |
} |
201 |
} else { |
202 |
#pragma ivdep |
203 |
for (q=n_start;q<n_end;++q) { |
204 |
z[q]=a*x[q]+b*y[q]; |
205 |
} |
206 |
} |
207 |
} |
208 |
} |
209 |
|
210 |
|
211 |
|
212 |
double Paso_InnerProduct(const dim_t n,const double* x, const double* y, Paso_MPIInfo* mpiinfo) |
213 |
{ |
214 |
dim_t i,local_n,rest,n_start,n_end,q; |
215 |
double my_out=0, local_out=0., out=0.; |
216 |
#ifdef _OPENMP |
217 |
const int num_threads=omp_get_max_threads(); |
218 |
#else |
219 |
const int num_threads=1; |
220 |
#endif |
221 |
#pragma omp parallel for private(i,local_n,rest,n_start,n_end,q) reduction(+:my_out) |
222 |
for (i=0;i<num_threads;++i) { |
223 |
local_n=n/num_threads; |
224 |
rest=n-local_n*num_threads; |
225 |
n_start=local_n*i+MIN(i,rest); |
226 |
n_end=local_n*(i+1)+MIN(i+1,rest); |
227 |
for (q=n_start;q<n_end;++q) local_out+=x[q]*y[q]; |
228 |
my_out+=local_out; |
229 |
} |
230 |
#ifdef PASO_MPI |
231 |
#pragma omp single |
232 |
{ |
233 |
MPI_Allreduce(&my_out,&out, 1, MPI_DOUBLE, MPI_SUM, mpiinfo->comm); |
234 |
} |
235 |
#else |
236 |
out=my_out; |
237 |
#endif |
238 |
|
239 |
return out; |
240 |
|
241 |
} |
242 |
double Paso_l2(const dim_t n, const double* x, Paso_MPIInfo* mpiinfo) |
243 |
{ |
244 |
dim_t i,local_n,rest,n_start,n_end,q; |
245 |
double my_out=0, local_out=0., out=0.; |
246 |
#ifdef _OPENMP |
247 |
const int num_threads=omp_get_max_threads(); |
248 |
#else |
249 |
const int num_threads=1; |
250 |
#endif |
251 |
#pragma omp parallel for private(i,local_n,rest,n_start,n_end,q) reduction(+:my_out) |
252 |
for (i=0;i<num_threads;++i) { |
253 |
local_n=n/num_threads; |
254 |
rest=n-local_n*num_threads; |
255 |
n_start=local_n*i+MIN(i,rest); |
256 |
n_end=local_n*(i+1)+MIN(i+1,rest); |
257 |
for (q=n_start;q<n_end;++q) local_out+=x[q]*x[q]; |
258 |
my_out+=local_out; |
259 |
} |
260 |
#ifdef PASO_MPI |
261 |
#pragma omp single |
262 |
{ |
263 |
MPI_Allreduce(&my_out,&out, 1, MPI_DOUBLE, MPI_SUM, mpiinfo->comm); |
264 |
} |
265 |
#else |
266 |
out=my_out; |
267 |
#endif |
268 |
|
269 |
return sqrt(out); |
270 |
|
271 |
} |
272 |
/* |
273 |
* Apply a sequence of n-1 Givens rotations (c,s) to v of length n which assumed to be small. |
274 |
* |
275 |
*/ |
276 |
void ApplyGivensRotations(const dim_t n,double* v,const double* c,const double* s) |
277 |
{ |
278 |
dim_t i; |
279 |
register double w1,w2; |
280 |
#pragma ivdep |
281 |
for (i=0; i<n-1;++i) { |
282 |
w1=c[i]*v[i]-s[i]*v[i+1]; |
283 |
w2=s[i]*v[i]+c[i]*v[i+1]; |
284 |
v[i]=w1; |
285 |
v[i+1]=w2; |
286 |
} |
287 |
} |
288 |
|
289 |
bool_t Paso_fileExists( const char* filename ) |
290 |
{ |
291 |
FILE* fp = NULL; |
292 |
fp = fopen(filename,"r"); |
293 |
if( fp != NULL ) { |
294 |
close(fp); |
295 |
return TRUE; |
296 |
} else { |
297 |
return FALSE; |
298 |
} |
299 |
} |