124 |
tol = *resid; |
tol = *resid; |
125 |
Performance_startMonitor(pp,PERFORMANCE_SOLVER); |
Performance_startMonitor(pp,PERFORMANCE_SOLVER); |
126 |
/* initialize data */ |
/* initialize data */ |
127 |
#pragma omp parallel (i0, istart, iend, ipp) |
#pragma omp parallel private(i0, istart, iend, ipp) |
128 |
{ |
{ |
129 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
130 |
#pragma omp for schedule(dynamic, 1) |
#pragma omp for schedule(dynamic, 1) |
162 |
Performance_startMonitor(pp,PERFORMANCE_SOLVER); |
Performance_startMonitor(pp,PERFORMANCE_SOLVER); |
163 |
/* tau=v*r */ |
/* tau=v*r */ |
164 |
sum_1 = 0; |
sum_1 = 0; |
165 |
#pragma omp parallel (i0, istart, iend, ipp, ss) |
#pragma omp parallel private(i0, istart, iend, ipp, ss) |
166 |
{ |
{ |
167 |
|
ss=0; |
168 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
169 |
#pragma omp for schedule(dynamic, 1) |
#pragma omp for schedule(dynamic, 1) |
170 |
for (ipp=0; ipp < n_chunks; ++ipp) { |
for (ipp=0; ipp < n_chunks; ++ipp) { |
176 |
istart=len*ipp+MIN(ipp,rest); |
istart=len*ipp+MIN(ipp,rest); |
177 |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
178 |
#endif |
#endif |
|
ss=0; |
|
179 |
#pragma ivdep |
#pragma ivdep |
180 |
for (i0=istart;i0<iend;i0++) ss+=v[i0]*r[i0]; |
for (i0=istart;i0<iend;i0++) ss+=v[i0]*r[i0]; |
|
#pragma critical |
|
|
sum_1+=ss; |
|
181 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
182 |
} |
} |
183 |
#else |
#else |
184 |
} |
} |
185 |
#endif |
#endif |
186 |
|
#pragma critical |
187 |
|
{ |
188 |
|
sum_1+=ss; |
189 |
|
} |
190 |
} |
} |
191 |
#ifdef PASO_MPI |
#ifdef PASO_MPI |
192 |
/* In case we have many MPI processes, each of which may have several OMP threads: |
/* In case we have many MPI processes, each of which may have several OMP threads: |
197 |
tau_old=tau; |
tau_old=tau; |
198 |
tau=sum_1; |
tau=sum_1; |
199 |
/* p=v+beta*p */ |
/* p=v+beta*p */ |
200 |
#pragma omp parallel (i0, istart, iend, ipp,beta) |
#pragma omp parallel private(i0, istart, iend, ipp,beta) |
201 |
{ |
{ |
202 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
203 |
#pragma omp for schedule(dynamic, 1) |
#pragma omp for schedule(dynamic, 1) |
233 |
|
|
234 |
/* delta=p*v */ |
/* delta=p*v */ |
235 |
sum_2 = 0; |
sum_2 = 0; |
236 |
#pragma omp parallel (i0, istart, iend, ipp,ss) |
#pragma omp parallel private(i0, istart, iend, ipp,ss) |
237 |
{ |
{ |
238 |
|
ss=0; |
239 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
240 |
#pragma omp for schedule(dynamic, 1) |
#pragma omp for schedule(dynamic, 1) |
241 |
for (ipp=0; ipp < n_chunks; ++ipp) { |
for (ipp=0; ipp < n_chunks; ++ipp) { |
247 |
istart=len*ipp+MIN(ipp,rest); |
istart=len*ipp+MIN(ipp,rest); |
248 |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
249 |
#endif |
#endif |
|
ss=0; |
|
250 |
#pragma ivdep |
#pragma ivdep |
251 |
for (i0=istart;i0<iend;i0++) ss+=v[i0]*p[i0]; |
for (i0=istart;i0<iend;i0++) ss+=v[i0]*p[i0]; |
|
#pragma critical |
|
|
sum_2+=ss; |
|
252 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
253 |
} |
} |
254 |
#else |
#else |
255 |
} |
} |
256 |
#endif |
#endif |
257 |
|
#pragma critical |
258 |
|
{ |
259 |
|
sum_2+=ss; |
260 |
|
} |
261 |
} |
} |
262 |
#ifdef PASO_MPI |
#ifdef PASO_MPI |
263 |
loc_sum[0] = sum_2; |
loc_sum[0] = sum_2; |
270 |
/* smoother */ |
/* smoother */ |
271 |
sum_3 = 0; |
sum_3 = 0; |
272 |
sum_4 = 0; |
sum_4 = 0; |
273 |
#pragma omp parallel (i0, istart, iend, ipp,d, ss, ss1, alpha) |
#pragma omp parallel private(i0, istart, iend, ipp,d, ss, ss1, alpha) |
274 |
{ |
{ |
275 |
|
ss=0; |
276 |
|
ss1=0; |
277 |
|
alpha=tau/delta; |
278 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
279 |
#pragma omp for schedule(dynamic, 1) |
#pragma omp for schedule(dynamic, 1) |
280 |
for (ipp=0; ipp < n_chunks; ++ipp) { |
for (ipp=0; ipp < n_chunks; ++ipp) { |
286 |
istart=len*ipp+MIN(ipp,rest); |
istart=len*ipp+MIN(ipp,rest); |
287 |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
288 |
#endif |
#endif |
|
ss=0; |
|
|
ss1=0; |
|
|
alpha=tau/delta; |
|
289 |
#pragma ivdep |
#pragma ivdep |
290 |
for (i0=istart;i0<iend;i0++) { |
for (i0=istart;i0<iend;i0++) { |
291 |
r[i0]-=alpha*v[i0]; |
r[i0]-=alpha*v[i0]; |
293 |
ss+=d*d; |
ss+=d*d; |
294 |
ss1+=d*rs[i0]; |
ss1+=d*rs[i0]; |
295 |
} |
} |
|
#pragma critical |
|
|
{ |
|
|
sum_3+=ss; |
|
|
sum_4+=ss1; |
|
|
} |
|
296 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
297 |
} |
} |
298 |
#else |
#else |
299 |
} |
} |
300 |
#endif |
#endif |
301 |
|
#pragma critical |
302 |
|
{ |
303 |
|
sum_3+=ss; |
304 |
|
sum_4+=ss1; |
305 |
|
} |
306 |
} |
} |
307 |
#ifdef PASO_MPI |
#ifdef PASO_MPI |
308 |
loc_sum[0] = sum_3; |
loc_sum[0] = sum_3; |
312 |
sum_4=sum[1]; |
sum_4=sum[1]; |
313 |
#endif |
#endif |
314 |
sum_5 = 0; |
sum_5 = 0; |
315 |
#pragma omp parallel (i0, istart, iend, ipp, ss, gamma_1,gamma_2) |
#pragma omp parallel private(i0, istart, iend, ipp, ss, gamma_1,gamma_2) |
316 |
{ |
{ |
317 |
gamma_1= ( (ABS(sum_3)<= ZERO) ? 0 : -sum_4/sum_3) ; |
gamma_1= ( (ABS(sum_3)<= ZERO) ? 0 : -sum_4/sum_3) ; |
318 |
gamma_2= ONE-gamma_1; |
gamma_2= ONE-gamma_1; |
319 |
|
ss=0; |
320 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
321 |
#pragma omp for schedule(dynamic, 1) |
#pragma omp for schedule(dynamic, 1) |
322 |
for (ipp=0; ipp < n_chunks; ++ipp) { |
for (ipp=0; ipp < n_chunks; ++ipp) { |
328 |
istart=len*ipp+MIN(ipp,rest); |
istart=len*ipp+MIN(ipp,rest); |
329 |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
330 |
#endif |
#endif |
|
ss=0; |
|
331 |
#pragma ivdep |
#pragma ivdep |
332 |
for (i0=istart;i0<iend;i0++) { |
for (i0=istart;i0<iend;i0++) { |
333 |
rs[i0]=gamma_2*rs[i0]+gamma_1*r[i0]; |
rs[i0]=gamma_2*rs[i0]+gamma_1*r[i0]; |
335 |
x[i0]=gamma_2*x[i0]+gamma_1*x2[i0]; |
x[i0]=gamma_2*x[i0]+gamma_1*x2[i0]; |
336 |
ss+=rs[i0]*rs[i0]; |
ss+=rs[i0]*rs[i0]; |
337 |
} |
} |
|
#pragma omp critical |
|
|
sum_5+=ss; |
|
338 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
339 |
} |
} |
340 |
#else |
#else |
341 |
} |
} |
342 |
#endif |
#endif |
343 |
|
#pragma omp critical |
344 |
|
{ |
345 |
|
sum_5+=ss; |
346 |
|
} |
347 |
} |
} |
348 |
#ifdef PASO_MPI |
#ifdef PASO_MPI |
349 |
loc_sum[0] = sum_5; |
loc_sum[0] = sum_5; |