137 |
istart=len*ipp+MIN(ipp,rest); |
istart=len*ipp+MIN(ipp,rest); |
138 |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
139 |
#endif |
#endif |
140 |
#pragma ivdep |
#pragma ivdep |
141 |
for (i0=istart;i0<iend;i0++) { |
for (i0=istart;i0<iend;i0++) { |
142 |
rs[i0]=r[i0]; |
rs[i0]=r[i0]; |
143 |
x2[i0]=x[i0]; |
x2[i0]=x[i0]; |
144 |
p[i0]=0; |
p[i0]=0; |
145 |
v[i0]=0; |
v[i0]=0; |
146 |
} |
} |
147 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
148 |
} |
} |
149 |
#else |
#else |
171 |
istart=chunk_size*ipp; |
istart=chunk_size*ipp; |
172 |
iend=MIN(istart+chunk_size,n); |
iend=MIN(istart+chunk_size,n); |
173 |
#else |
#else |
174 |
#pragma omp for schedule(static) |
#pragma omp for schedule(static) |
175 |
for (ipp=0; ipp <np; ++ipp) { |
for (ipp=0; ipp <np; ++ipp) { |
176 |
istart=len*ipp+MIN(ipp,rest); |
istart=len*ipp+MIN(ipp,rest); |
177 |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
178 |
#endif |
#endif |
179 |
#pragma ivdep |
#pragma ivdep |
180 |
for (i0=istart;i0<iend;i0++) ss+=v[i0]*r[i0]; |
for (i0=istart;i0<iend;i0++) ss+=v[i0]*r[i0]; |
181 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
182 |
} |
} |
183 |
#else |
#else |
184 |
} |
} |
185 |
#endif |
#endif |
186 |
#pragma critical |
#pragma omp critical |
187 |
{ |
{ |
188 |
sum_1+=ss; |
sum_1+=ss; |
189 |
} |
} |
190 |
} |
} |
191 |
#ifdef PASO_MPI |
#ifdef PASO_MPI |
210 |
istart=len*ipp+MIN(ipp,rest); |
istart=len*ipp+MIN(ipp,rest); |
211 |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
212 |
#endif |
#endif |
213 |
if (num_iter==1) { |
if (num_iter==1) { |
214 |
#pragma ivdep |
#pragma ivdep |
215 |
for (i0=istart;i0<iend;i0++) p[i0]=v[i0]; |
for (i0=istart;i0<iend;i0++) p[i0]=v[i0]; |
216 |
} else { |
} else { |
217 |
beta=tau/tau_old; |
beta=tau/tau_old; |
218 |
#pragma ivdep |
#pragma ivdep |
219 |
for (i0=istart;i0<iend;i0++) p[i0]=v[i0]+beta*p[i0]; |
for (i0=istart;i0<iend;i0++) p[i0]=v[i0]+beta*p[i0]; |
220 |
} |
} |
221 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
222 |
} |
} |
223 |
#else |
#else |
247 |
istart=len*ipp+MIN(ipp,rest); |
istart=len*ipp+MIN(ipp,rest); |
248 |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
249 |
#endif |
#endif |
250 |
#pragma ivdep |
#pragma ivdep |
251 |
for (i0=istart;i0<iend;i0++) ss+=v[i0]*p[i0]; |
for (i0=istart;i0<iend;i0++) ss+=v[i0]*p[i0]; |
252 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
253 |
} |
} |
254 |
#else |
#else |
255 |
} |
} |
256 |
#endif |
#endif |
257 |
#pragma critical |
#pragma omp critical |
258 |
{ |
{ |
259 |
sum_2+=ss; |
sum_2+=ss; |
260 |
} |
} |
261 |
} |
} |
262 |
#ifdef PASO_MPI |
#ifdef PASO_MPI |
264 |
MPI_Allreduce(loc_sum, &sum_2, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm); |
MPI_Allreduce(loc_sum, &sum_2, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm); |
265 |
#endif |
#endif |
266 |
delta=sum_2; |
delta=sum_2; |
267 |
|
alpha=tau/delta; |
268 |
|
|
269 |
if (! (breakFlag = (ABS(delta) <= TOLERANCE_FOR_SCALARS))) { |
if (! (breakFlag = (ABS(delta) <= TOLERANCE_FOR_SCALARS))) { |
270 |
/* smoother */ |
/* smoother */ |
271 |
sum_3 = 0; |
sum_3 = 0; |
272 |
sum_4 = 0; |
sum_4 = 0; |
273 |
#pragma omp parallel private(i0, istart, iend, ipp,d, ss, ss1, alpha) |
#pragma omp parallel private(i0, istart, iend, ipp,d, ss, ss1) |
274 |
{ |
{ |
275 |
ss=0; |
ss=0; |
276 |
ss1=0; |
ss1=0; |
|
alpha=tau/delta; |
|
277 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
278 |
#pragma omp for schedule(dynamic, 1) |
#pragma omp for schedule(dynamic, 1) |
279 |
for (ipp=0; ipp < n_chunks; ++ipp) { |
for (ipp=0; ipp < n_chunks; ++ipp) { |
285 |
istart=len*ipp+MIN(ipp,rest); |
istart=len*ipp+MIN(ipp,rest); |
286 |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
287 |
#endif |
#endif |
288 |
#pragma ivdep |
#pragma ivdep |
289 |
for (i0=istart;i0<iend;i0++) { |
for (i0=istart;i0<iend;i0++) { |
290 |
r[i0]-=alpha*v[i0]; |
r[i0]-=alpha*v[i0]; |
291 |
d=r[i0]-rs[i0]; |
d=r[i0]-rs[i0]; |
292 |
ss+=d*d; |
ss+=d*d; |
293 |
ss1+=d*rs[i0]; |
ss1+=d*rs[i0]; |
294 |
} |
} |
295 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
296 |
} |
} |
297 |
#else |
#else |
298 |
} |
} |
299 |
#endif |
#endif |
300 |
#pragma critical |
#pragma omp critical |
301 |
{ |
{ |
302 |
sum_3+=ss; |
sum_3+=ss; |
303 |
sum_4+=ss1; |
sum_4+=ss1; |
304 |
} |
} |
305 |
} |
} |
306 |
#ifdef PASO_MPI |
#ifdef PASO_MPI |
327 |
istart=len*ipp+MIN(ipp,rest); |
istart=len*ipp+MIN(ipp,rest); |
328 |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
iend=len*(ipp+1)+MIN(ipp+1,rest); |
329 |
#endif |
#endif |
330 |
#pragma ivdep |
#pragma ivdep |
331 |
for (i0=istart;i0<iend;i0++) { |
for (i0=istart;i0<iend;i0++) { |
332 |
rs[i0]=gamma_2*rs[i0]+gamma_1*r[i0]; |
rs[i0]=gamma_2*rs[i0]+gamma_1*r[i0]; |
333 |
x2[i0]+=alpha*p[i0]; |
x2[i0]+=alpha*p[i0]; |
334 |
x[i0]=gamma_2*x[i0]+gamma_1*x2[i0]; |
x[i0]=gamma_2*x[i0]+gamma_1*x2[i0]; |
335 |
ss+=rs[i0]*rs[i0]; |
ss+=rs[i0]*rs[i0]; |
336 |
} |
} |
337 |
#ifdef USE_DYNAMIC_SCHEDULING |
#ifdef USE_DYNAMIC_SCHEDULING |
338 |
} |
} |
339 |
#else |
#else |
341 |
#endif |
#endif |
342 |
#pragma omp critical |
#pragma omp critical |
343 |
{ |
{ |
344 |
sum_5+=ss; |
sum_5+=ss; |
345 |
} |
} |
346 |
} |
} |
347 |
#ifdef PASO_MPI |
#ifdef PASO_MPI |