1 |
/* $Id:$ */ |
2 |
/******************************************************* |
3 |
* |
4 |
* Copyright 2008 by University of Queensland |
5 |
* |
6 |
* http://esscc.uq.edu.au |
7 |
* Primary Business: Queensland, Australia |
8 |
* Licensed under the Open Software License version 3.0 |
9 |
* http://www.opensource.org/licenses/osl-3.0.php |
10 |
* |
11 |
*******************************************************/ |
12 |
|
13 |
#include "Util.h" |
14 |
|
15 |
/* |
16 |
* set x to zero: |
17 |
*/ |
18 |
void Paso_zeroes(const dim_t n, double* x) |
19 |
{ |
20 |
dim_t i,local_n,rest,n_start,n_end,q; |
21 |
#ifdef _OPENMP |
22 |
const int num_threads=omp_get_num_threads(); |
23 |
#else |
24 |
const int num_threads=1; |
25 |
#endif |
26 |
|
27 |
#pragma omp parallel for private(i,local_n,rest,n_start,n_end,q) |
28 |
for (i=0;i<num_threads;++i) { |
29 |
local_n=n/num_threads; |
30 |
rest=n-local_n*num_threads; |
31 |
n_start=local_n*i+MIN(i,rest); |
32 |
n_end=local_n*(i+1)+MIN(i+1,rest); |
33 |
#pragma ivdep |
34 |
for (q=n_start;q<n_end;++q) { |
35 |
x[q]=0; |
36 |
} |
37 |
} |
38 |
|
39 |
} |
40 |
/* |
41 |
* performs an update of the form x=a*x+b*y where y and x are long vectors. |
42 |
* if b=0, y is not used. |
43 |
*/ |
44 |
void Paso_Update(const dim_t n, const double a, double* x, const double b, const double* y) |
45 |
{ |
46 |
dim_t i,local_n,rest,n_start,n_end,q; |
47 |
#ifdef _OPENMP |
48 |
const int num_threads=omp_get_num_threads(); |
49 |
#else |
50 |
const int num_threads=1; |
51 |
#endif |
52 |
|
53 |
#pragma omp parallel for private(i,local_n,rest,n_start,n_end,q) |
54 |
for (i=0;i<num_threads;++i) { |
55 |
local_n=n/num_threads; |
56 |
rest=n-local_n*num_threads; |
57 |
n_start=local_n*i+MIN(i,rest); |
58 |
n_end=local_n*(i+1)+MIN(i+1,rest); |
59 |
if (a==1.) { |
60 |
#pragma ivdep |
61 |
for (q=n_start;q<n_end;++q) { |
62 |
x[q]+=b*y[q]; |
63 |
} |
64 |
} else if ((a==1.) && (b==1)) { |
65 |
#pragma ivdep |
66 |
for (q=n_start;q<n_end;++q) { |
67 |
x[q]+=y[q]; |
68 |
} |
69 |
} else if ((ABS(a)==0) && (ABS(b)==0)) { |
70 |
#pragma ivdep |
71 |
for (q=n_start;q<n_end;++q) { |
72 |
x[q]=0.; |
73 |
} |
74 |
} else if ((ABS(a)==0) && (ABS(b)==1)) { |
75 |
#pragma ivdep |
76 |
for (q=n_start;q<n_end;++q) { |
77 |
x[q]=y[q]; |
78 |
} |
79 |
} else if (ABS(a)==0) { |
80 |
#pragma ivdep |
81 |
for (q=n_start;q<n_end;++q) { |
82 |
x[q]=b*y[q]; |
83 |
} |
84 |
} else { |
85 |
#pragma ivdep |
86 |
for (q=n_start;q<n_end;++q) { |
87 |
x[q]=a*x[q]+b*y[q]; |
88 |
} |
89 |
} |
90 |
} |
91 |
} |
92 |
/* |
93 |
* performs an update of the form z=a*x+b*y where y and x are long vectors. |
94 |
* if a=0, x is not used. |
95 |
* if b=0, y is not used. |
96 |
*/ |
97 |
void Paso_LinearCombination(const dim_t n, double*z, const double a,const double* x, const double b, const double* y) |
98 |
{ |
99 |
dim_t i,local_n,rest,n_start,n_end,q; |
100 |
#ifdef _OPENMP |
101 |
const int num_threads=omp_get_num_threads(); |
102 |
#else |
103 |
const int num_threads=1; |
104 |
#endif |
105 |
|
106 |
#pragma omp parallel for private(i,local_n,rest,n_start,n_end,q) |
107 |
for (i=0;i<num_threads;++i) { |
108 |
local_n=n/num_threads; |
109 |
rest=n-local_n*num_threads; |
110 |
n_start=local_n*i+MIN(i,rest); |
111 |
n_end=local_n*(i+1)+MIN(i+1,rest); |
112 |
if ((ABS(a)==0) && (ABS(b)==0)) { |
113 |
#pragma ivdep |
114 |
for (q=n_start;q<n_end;++q) { |
115 |
z[q]=0.; |
116 |
} |
117 |
} else if ((ABS(a)==0) && (ABS(b)>0.)) { |
118 |
#pragma ivdep |
119 |
for (q=n_start;q<n_end;++q) { |
120 |
z[q]=b*y[q]; |
121 |
} |
122 |
} else if ((ABS(a)>0) && (ABS(b)==0.)) { |
123 |
#pragma ivdep |
124 |
for (q=n_start;q<n_end;++q) { |
125 |
z[q]=a*x[q]; |
126 |
} |
127 |
} else { |
128 |
#pragma ivdep |
129 |
for (q=n_start;q<n_end;++q) { |
130 |
z[q]=a*x[q]+b*y[q]; |
131 |
} |
132 |
} |
133 |
} |
134 |
} |
135 |
|
136 |
|
137 |
|
138 |
double Paso_InnerProduct(const dim_t n,const double* x, const double* y, Paso_MPIInfo* mpiinfo) |
139 |
{ |
140 |
dim_t i,local_n,rest,n_start,n_end,q; |
141 |
double my_out=0, local_out=0., out=0.; |
142 |
#ifdef _OPENMP |
143 |
const int num_threads=omp_get_num_threads(); |
144 |
#else |
145 |
const int num_threads=1; |
146 |
#endif |
147 |
#pragma omp parallel for private(i,local_n,rest,n_start,n_end,q) reduction(+:my_out) |
148 |
for (i=0;i<num_threads;++i) { |
149 |
local_n=n/num_threads; |
150 |
rest=n-local_n*num_threads; |
151 |
n_start=local_n*i+MIN(i,rest); |
152 |
n_end=local_n*(i+1)+MIN(i+1,rest); |
153 |
for (q=n_start;q<n_end;++q) local_out+=x[q]*y[q]; |
154 |
my_out+=local_out; |
155 |
} |
156 |
#ifdef PASO_MPI |
157 |
#pragma omp master |
158 |
{ |
159 |
MPI_Allreduce(&my_out,&out, 1, MPI_DOUBLE, MPI_SUM, mpiinfo->comm); |
160 |
} |
161 |
#else |
162 |
out=my_out; |
163 |
#endif |
164 |
|
165 |
return out; |
166 |
|
167 |
} |
168 |
double Paso_l2(const dim_t n, const double* x, Paso_MPIInfo* mpiinfo) |
169 |
{ |
170 |
dim_t i,local_n,rest,n_start,n_end,q; |
171 |
double my_out=0, local_out=0., out=0.; |
172 |
#ifdef _OPENMP |
173 |
const int num_threads=omp_get_num_threads(); |
174 |
#else |
175 |
const int num_threads=1; |
176 |
#endif |
177 |
#pragma omp parallel for private(i,local_n,rest,n_start,n_end,q) reduction(+:my_out) |
178 |
for (i=0;i<num_threads;++i) { |
179 |
local_n=n/num_threads; |
180 |
rest=n-local_n*num_threads; |
181 |
n_start=local_n*i+MIN(i,rest); |
182 |
n_end=local_n*(i+1)+MIN(i+1,rest); |
183 |
for (q=n_start;q<n_end;++q) local_out+=x[q]*x[q]; |
184 |
my_out+=local_out; |
185 |
} |
186 |
#ifdef PASO_MPI |
187 |
#pragma omp master |
188 |
{ |
189 |
MPI_Allreduce(&my_out,&out, 1, MPI_DOUBLE, MPI_SUM, mpiinfo->comm); |
190 |
} |
191 |
#else |
192 |
out=my_out; |
193 |
#endif |
194 |
|
195 |
return sqrt(out); |
196 |
|
197 |
} |
198 |
/* |
199 |
* Apply a sequence of n-1 Givens rotations (c,s) to v of length n which assumed to be small. |
200 |
* |
201 |
*/ |
202 |
void ApplyGivensRotations(const dim_t n,double* v,const double* c,const double* s) |
203 |
{ |
204 |
dim_t i; |
205 |
register double w1,w2; |
206 |
#pragma ivdep |
207 |
for (i=0; i<n-1;++i) { |
208 |
w1=c[i]*v[i]-s[i]*v[i+1]; |
209 |
w2=s[i]*v[i]+c[i]*v[i+1]; |
210 |
v[i]=w1; |
211 |
v[i+1]=w2; |
212 |
} |
213 |
} |