Description
Updated test results (OpenBLAS 0.3.3)
Windows 64-bit
gcc version 4.9.3 20150626 (Fedora MinGW 4.9.3-1.el7)
-
USE_THREAD=0 NUM_THREADS=1
segmentation fault -
USE_THREAD=0 NUM_THREADS=16
incorrect result -
USE_THREAD=1 NUM_THREADS=2 USE_OPENMP=1 NUM_PARALLEL=16
incorrect result -
USE_THREAD=1 NUM_THREADS=2 openblas_set_num_threads(1)
incorrect result
Linux 64-bit
gcc version 4.4.7 20120313 (Red Hat 4.4.7-4)
-
USE_THREAD=0 NUM_THREADS=1
too many memory regions -
USE_THREAD=0 NUM_THREADS=16
OK
A segmentation fault occurs if OMP_NUM_THREADS > 4, but linking against netlib-lapack works fine, so I think this may be a problem on the OpenBLAS side. I also tested it on Linux, where no segmentation fault occurs.
GCC: x86_64-w64-mingw32-gcc 4.9.3
A single-threaded OpenBLAS is built with:
USE_THREAD=0 DYNAMIC_ARCH=1 DYNAMIC_OLDER=0 NO_CBLAS=1 NO_LAPACKE=1 NO_SHARED=1
Test code: segfault-win.cpp
#include <cstdlib>
#include <vector>
#include <iostream>
// g++ segfault-win.cpp -lopenblas -lgfortran -lquadmath -fopenmp
// Integer type used for every LAPACK integer argument in this file.
// A typedef (instead of a macro) keeps the name a real, scoped type.
typedef int bint;

// Fortran LAPACK routine DGELS; every argument is passed by pointer.
extern "C"
void dgels_(char *trans, bint *m, bint *n, bint *nrhs, double *a, bint *lda, double *b, bint *ldb,
            double *work, bint *lwork, bint *info);

// OpenBLAS thread-count control and query.
extern "C" void openblas_set_num_threads(int);
extern "C" int openblas_get_num_threads();

// OpenMP runtime query, declared by hand rather than via <omp.h>.
extern "C" int omp_get_max_threads();
// Thin C++ wrapper around the Fortran routine dgels_.
//
// Calls dgels_ twice: first with lwork = -1 so the routine reports its
// preferred workspace size in wkopt, then again with a workspace of that size.
// The arrays a and b are handed straight to dgels_ and are modified by it.
//
// Returns the info value produced by dgels_ (0 when both calls succeeded).
int C_dgels(char trans, bint m, bint n, bint nrhs, double *a, bint lda, double *b, bint ldb)
{
    bint info = 0;
    double wkopt = 0.0;
    bint lwork = -1;  // -1 requests a workspace-size query only

    dgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, &wkopt, &lwork, &info);
    if (info != 0)
        return info;

    lwork = static_cast<bint>( wkopt );
    // Never request a non-positive allocation; if the query result was bogus,
    // dgels_ gets a too-small workspace and can report that through info.
    if (lwork < 1)
        lwork = 1;

    // The vector owns the workspace, so it is released on every exit path.
    std::vector<double> work(static_cast<std::size_t>(lwork));
    dgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work.data(), &lwork, &info);
    return info;
}
// Reproduction driver: runs the same dgels problem r times inside an OpenMP
// parallel loop and prints, for every repetition after the first, 1 if its
// result vector equals the first repetition's result and 0 otherwise.
int main()
{
    // USE_THREAD=1 NUM_THREADS=2
    // openblas_set_num_threads(1);
    const int n = 300;   // length of y; also the leading dimension passed to dgels
    const int q = 5;     // number of x values generated per y value
    const int r = 1000;  // number of repeated solves

    // Random inputs: each y value is drawn first, followed by q values of x.
    std::vector<double> x, y;
    x.reserve(static_cast<std::size_t>(n) * q);
    y.reserve(n);
    for (int i = 0; i < n; ++i) {
        y.push_back( static_cast<double>(std::rand()) / RAND_MAX );
        for (int j = 0; j < q; ++j)
            x.push_back( static_cast<double>(std::rand()) / RAND_MAX );
    }

    std::cerr << "omp_get_max_threads: " << omp_get_max_threads() << "\n";
    std::cerr << "openblas_get_num_threads: " << openblas_get_num_threads() << "\n";

    std::vector< std::vector<double> > z(r);
    std::vector<int> status(r, 0);  // info value returned for each repetition
    #pragma omp parallel for
    for (int i = 0; i < r; ++i) {
        // dgels overwrites its inputs, so every iteration works on private copies.
        std::vector<double> xx = x;
        std::vector<double> yy = y;
        status[i] = C_dgels('N', n, q, 1, xx.data(), n, yy.data(), n);
        z[i].swap(yy);  // keep the result without copying it again
    }

    // Report failed solves instead of silently dropping the info codes.
    int failures = 0;
    for (int i = 0; i < r; ++i) {
        if (status[i] != 0)
            ++failures;
    }
    if (failures != 0)
        std::cerr << "dgels returned nonzero info in " << failures << " of " << r << " solves\n";

    for (int i = 1; i < r; ++i)
        std::cerr << (z[i] == z[0]);
    std::cerr << "\n";
    return 0;
}