// This kernel uses registers for matrix storage and shared memory for communication.
// It also uses lazy swap.
//extern __shared__ double zdata[];
// LU-style elimination without pivoting on an N-column panel, one thread per
// value of `tx`.
//
// Each calling thread loads the N entries dA[i * ldda + tx], i = 0..N-1, into
// registers, takes part in N elimination steps, and stores the N entries back
// to the same locations.  At step i the thread with tx == i publishes its
// register row through `sx`; every thread then reads the pivot sx[i] and the
// remaining entries sx[i+1..N-1] from there.
//
// Parameters:
//   m      - not referenced by this routine.
//   dA     - panel base pointer; accessed as dA[i * ldda + tx].
//   ldda   - stride between consecutive values of i in dA.
//   info   - on entry, consulted only when gbstep > 0: a nonzero value makes
//            the routine return immediately without touching dA.
//            on exit, the thread with tx == 0 stores 0 if no zero pivot was
//            met, otherwise gbstep + i + 1 for the first step i whose pivot
//            is exactly zero.
//   tx     - index of the entry handled by the calling thread.
//   sx     - scratch buffer of at least N doubles, written by one thread and
//            read by all of them between barriers (presumably shared memory,
//            as the file header says -- allocated by the caller).
//   gbstep - offset added to the step index when reporting a zero pivot
//            (presumably the panel's global column offset -- confirm against
//            the caller).
//
// Once a zero pivot has been recorded, the scaling factor is forced to one for
// that step and for all later steps; the updates themselves still run.
template<int N>
__device__ void
dgetf2_nopiv_device(int m, double* dA, int ldda, magma_int_t *info, const int tx, double* sx, int gbstep)
{
    // A failure reported by a previous panel must survive, so leave both the
    // panel and *info untouched in that case.
    // NOTE(review): this return happens before the __syncthreads() calls
    // below, so it relies on every thread that reaches those barriers making
    // the same decision here -- confirm against the caller's block layout.
    if (gbstep > 0 && *info != 0) {
        return;
    }

    double row[N]    = {MAGMA_D_ZERO};   // this thread's entries, kept in registers
    double scale     = MAGMA_D_ZERO;     // reciprocal of the current pivot (or one)
    double pivot_mag;                    // |re(pivot)| + |im(pivot)|
    int    first_bad = 0;                // 0, or gbstep + step + 1 of the first zero pivot

    // Load this thread's N entries.
    #pragma unroll
    for (int col = 0; col < N; ++col) {
        row[col] = dA[col * ldda + tx];
    }

    #pragma unroll
    for (int step = 0; step < N; ++step) {
        // The owner of the pivot row publishes it for everybody else.
        if (step == tx) {
            #pragma unroll
            for (int col = 0; col < N; ++col) {
                sx[col] = row[col];
            }
        }
        __syncthreads();   // sx must be complete before anyone reads it

        const double pivot = sx[step];
        pivot_mag = fabs(MAGMA_D_REAL( pivot )) + fabs(MAGMA_D_IMAG( pivot ));

        // Remember only the first exactly-zero pivot.
        if (first_bad == 0 && pivot_mag == MAGMA_D_ZERO) {
            first_bad = gbstep + step + 1;
        }

        // Never divide by a zero pivot (nor after one has been seen).
        if (first_bad == 0) {
            scale = MAGMA_D_DIV(MAGMA_D_ONE, pivot);
        }
        else {
            scale = MAGMA_D_ONE;
        }

        // Threads past the pivot scale their multiplier, then apply the
        // rank-1 update to their trailing entries.
        if (tx > step) {
            row[step] *= scale;
            #pragma unroll
            for (int col = step + 1; col < N; ++col) {
                row[col] -= row[step] * sx[col];
            }
        }
        __syncthreads();   // everyone is done with sx before the next owner overwrites it
    }

    // A single thread reports the status.
    if (tx == 0) {
        *info = static_cast<magma_int_t>(first_bad);
    }

    // Store the factored entries back.
    #pragma unroll
    for (int col = 0; col < N; ++col) {
        dA[col * ldda + tx] = row[col];
    }
}
A. N.