Rozwiązywanie liniowych układów AX = B z CUDA

Czy mogę korzystać z nowej biblioteki CUDA cuSOLVER (7) do rozwiązywania układów liniowych postaciRozwiązywanie liniowych układów AX = B z CUDA

AX = B

gdzie A, X i B są NxN gęste macierze?

Źródło

2015-03-01 Shay S.

Tak. W ramach cuSOLVER możesz użyć rozkładu QR, zobacz [Rozkład QR do rozwiązywania układów liniowych w CUDA] (http://stackoverflow.com/questions/22399794/qr-decomposition-to-solve-linear-systems-in-cuda). Alternatywnie, możesz obliczyć odwrotność macierzy przez kolejną inklinację 'cublas getrfBatched()' (która oblicza dekompozycję LU macierzy) i 'cublas getriBatched()' (która oblicza odwrotność macierzy zaczynając od jej LU rozkład). – JackOLantern

Ostatnią możliwością jest użycie 'cublas getrfBatched()', po której następuje dwukrotne wywołanie 'cublas trsm()' (która rozwiązuje górne lub dolne trójkątne systemy liniowe). – JackOLantern

Tak.

Podejście nr. 1

W ramach cuSOLVER można użyć rozkładu QR, patrz QR decomposition to solve linear systems in CUDA.

Podejście nr. 2

Alternatywnie, można obliczyć odwrotność macierzy przez kolejne involation z

cublas<t>getrfBatched()

który dokonuje dekompozycji logicznej matrycy i

cublas<t>getriBatched()

który oblicza odwrotność macierzy począwszy od dekompozycji LU.

Podejście nr. 3

Ostateczna możliwością jest użycie

cublas<t>getrfBatched()

następnie dwukrotnym wywołaniu

cublas<t>trsm()

który rozwiązuje górną lub dolną trójkątne układów liniowych.

Jak zauważył Robert Crovella, odpowiedź może być różna w zależności od rozmiaru i rodzaju zaangażowanych macierzy.

Kod dla podejścia nr. 1

Proszę zobaczyć QR decomposition to solve linear systems in CUDA.

Kod dla podejść nr. 2 i nr. 3

Poniżej przedstawiam przykład pracy dla realizacji podejść nr. 2 i 3. Hankel matrices są używane do zasilania podejść z dobrze kondycjonowanymi, odwracalnymi matrycami. Proszę zauważyć, że podejście nr. 3 wymaga permutacji (zmiany układu) wektora współczynników systemowych zgodnie z tablicą przestawną uzyskaną po wywołaniu cublas<t>getrfBatched(). Ta permutacja może być wygodnie wykonana na procesorze.

#include <stdio.h> 
#include <fstream> 
#include <iomanip> 
#include <stdlib.h>  /* srand, rand */ 
#include <time.h>  /* time */ 

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include "cublas_v2.h" 

#include "Utilities.cuh" 
#include "TimingGPU.cuh" 

#define prec_save 10 

#define BLOCKSIZE 256 

#define BLOCKSIZEX 16 
#define BLOCKSIZEY 16 

/************************************/ 
/* SAVE REAL ARRAY FROM CPU TO FILE */ 
/************************************/ 
template <class T> 
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) { 

    std::ofstream outfile; 
    outfile.open(filename); 
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n"; 
    outfile.close(); 

} 

/************************************/ 
/* SAVE REAL ARRAY FROM GPU TO FILE */ 
/************************************/ 
template <class T> 
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) { 

    T *h_in = (T *)malloc(M * sizeof(T)); 

    gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost)); 

    std::ofstream outfile; 
    outfile.open(filename); 
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n"; 
    outfile.close(); 

} 

/***************************************************/ 
/* FUNCTION TO SET THE VALUES OF THE HANKEL MATRIX */ 
/***************************************************/ 
// --- https://en.wikipedia.org/wiki/Hankel_matrix 
void setHankelMatrix(double * __restrict h_A, const int N) { 

    double *h_atemp = (double *)malloc((2 * N - 1) * sizeof(double)); 

    // --- Initialize random seed 
    srand(time(NULL)); 

    // --- Generate random numbers 
    for (int k = 0; k < 2 * N - 1; k++) h_atemp[k] = rand(); 

    // --- Fill the Hankel matrix. The Hankel matrix is symmetric, so filling by row or column is equivalent. 
    for (int i = 0; i < N; i++) 
     for (int j = 0; j < N; j++) 
      h_A[i * N + j] = h_atemp[(i + 1) + (j + 1) - 2]; 

    free(h_atemp); 

} 

/***********************************************/ 
/* FUNCTION TO COMPUTE THE COEFFICIENTS VECTOR */ 
/***********************************************/ 
void computeCoefficientsVector(const double * __restrict h_A, const double * __restrict h_xref, 
           double * __restrict h_y, const int N) { 

    for (int k = 0; k < N; k++) h_y[k] = 0.f; 

    for (int m = 0; m < N; m++) 
     for (int n = 0; n < N; n++) 
      h_y[m] = h_y[m] + h_A[n * N + m] * h_xref[n]; 

} 

/************************************/ 
/* COEFFICIENT REARRANGING FUNCTION */ 
/************************************/ 
void rearrange(double *vec, int *pivotArray, int N){ 
    for (int i = 0; i < N; i++) { 
     double temp = vec[i]; 
     vec[i] = vec[pivotArray[i] - 1]; 
     vec[pivotArray[i] - 1] = temp; 
    } 
} 

/********/ 
/* MAIN */ 
/********/ 
int main() { 

    const unsigned int N = 1000; 

    const unsigned int Nmatrices = 1; 

    // --- CUBLAS initialization 
    cublasHandle_t cublas_handle; 
    cublasSafeCall(cublasCreate(&cublas_handle)); 

    TimingGPU timerLU, timerApproach1, timerApproach2; 
    double timingLU, timingApproach1, timingApproach2; 

    /***********************/ 
    /* SETTING THE PROBLEM */ 
    /***********************/ 
    // --- Matrices to be inverted (only one in this example) 
    double *h_A = (double *)malloc(N * N * Nmatrices * sizeof(double)); 

    // --- Setting the Hankel matrix 
    setHankelMatrix(h_A, N); 

    // --- Defining the solution 
    double *h_xref = (double *)malloc(N * sizeof(double)); 
    for (int k = 0; k < N; k++) h_xref[k] = 1.f; 

    // --- Coefficient vectors (only one in this example) 
    double *h_y = (double *)malloc(N * sizeof(double)); 

    computeCoefficientsVector(h_A, h_xref, h_y, N); 

    // --- Result (only one in this example) 
    double *h_x = (double *)malloc(N * sizeof(double)); 

    // --- Allocate device space for the input matrices 
    double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * Nmatrices * sizeof(double))); 
    double *d_y; gpuErrchk(cudaMalloc(&d_y, N *     sizeof(double))); 
    double *d_x; gpuErrchk(cudaMalloc(&d_x, N *     sizeof(double))); 

    // --- Move the relevant matrices from host to device 
    gpuErrchk(cudaMemcpy(d_A, h_A, N * N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice)); 
    gpuErrchk(cudaMemcpy(d_y, h_y, N *     sizeof(double), cudaMemcpyHostToDevice)); 

    /**********************************/ 
    /* COMPUTING THE LU DECOMPOSITION */ 
    /**********************************/ 
    timerLU.StartCounter(); 

    // --- Creating the array of pointers needed as input/output to the batched getrf 
    double **h_inout_pointers = (double **)malloc(Nmatrices * sizeof(double *)); 
    for (int i = 0; i < Nmatrices; i++) h_inout_pointers[i] = d_A + i * N * N; 

    double **d_inout_pointers; 
    gpuErrchk(cudaMalloc(&d_inout_pointers, Nmatrices * sizeof(double *))); 
    gpuErrchk(cudaMemcpy(d_inout_pointers, h_inout_pointers, Nmatrices * sizeof(double *), cudaMemcpyHostToDevice)); 
    free(h_inout_pointers); 

    int *d_pivotArray; gpuErrchk(cudaMalloc(&d_pivotArray, N * Nmatrices * sizeof(int))); 
    int *d_InfoArray; gpuErrchk(cudaMalloc(&d_InfoArray,  Nmatrices * sizeof(int))); 

    int *h_InfoArray = (int *)malloc(Nmatrices * sizeof(int)); 

    cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, d_pivotArray, d_InfoArray, Nmatrices)); 
    //cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices)); 

    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost)); 

    for (int i = 0; i < Nmatrices; i++) 
     if (h_InfoArray[i] != 0) { 
      fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i); 
      cudaDeviceReset(); 
      exit(EXIT_FAILURE); 
     } 

    timingLU = timerLU.GetCounter(); 
    printf("Timing LU decomposition %f [ms]\n", timingLU); 

    /*********************************/ 
    /* CHECKING THE LU DECOMPOSITION */ 
    /*********************************/ 
    saveCPUrealtxt(h_A,   "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\A.txt", N * N); 
    saveCPUrealtxt(h_y,   "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\y.txt", N); 
    saveGPUrealtxt(d_A,   "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\Adecomposed.txt", N * N); 
    saveGPUrealtxt(d_pivotArray, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\pivotArray.txt", N); 

    /******************************************************************************/ 
    /* APPROACH NR.1: COMPUTE THE INVERSE OF A STARTING FROM ITS LU DECOMPOSITION */ 
    /******************************************************************************/ 
    timerApproach1.StartCounter(); 

    // --- Allocate device space for the inverted matrices 
    double *d_Ainv; gpuErrchk(cudaMalloc(&d_Ainv, N * N * Nmatrices * sizeof(double))); 

    // --- Creating the array of pointers needed as output to the batched getri 
    double **h_out_pointers = (double **)malloc(Nmatrices * sizeof(double *)); 
    for (int i = 0; i < Nmatrices; i++) h_out_pointers[i] = (double *)((char*)d_Ainv + i * ((size_t)N * N) * sizeof(double)); 

    double **d_out_pointers; 
    gpuErrchk(cudaMalloc(&d_out_pointers, Nmatrices*sizeof(double *))); 
    gpuErrchk(cudaMemcpy(d_out_pointers, h_out_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice)); 
    free(h_out_pointers); 

    cublasSafeCall(cublasDgetriBatched(cublas_handle, N, (const double **)d_inout_pointers, N, d_pivotArray, d_out_pointers, N, d_InfoArray, Nmatrices)); 

    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost)); 

    for (int i = 0; i < Nmatrices; i++) 
     if (h_InfoArray[i] != 0) { 
     fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i); 
     cudaDeviceReset(); 
     exit(EXIT_FAILURE); 
     } 

    double alpha1 = 1.f; 
    double beta1 = 0.f; 

    cublasSafeCall(cublasDgemv(cublas_handle, CUBLAS_OP_N, N, N, &alpha1, d_Ainv, N, d_y, 1, &beta1, d_x, 1)); 

    timingApproach1 = timingLU + timerApproach1.GetCounter(); 
    printf("Timing approach 1 %f [ms]\n", timingApproach1); 

    /**************************/ 
    /* CHECKING APPROACH NR.1 */ 
    /**************************/ 
    saveGPUrealtxt(d_x, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach1.txt", N); 

    /*************************************************************/ 
    /* APPROACH NR.2: INVERT UPPER AND LOWER TRIANGULAR MATRICES */ 
    /*************************************************************/ 
    timerApproach2.StartCounter(); 

    double *d_P; gpuErrchk(cudaMalloc(&d_P, N * N * sizeof(double))); 

    gpuErrchk(cudaMemcpy(h_y, d_y, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost)); 
    int *h_pivotArray = (int *)malloc(N * Nmatrices*sizeof(int)); 
    gpuErrchk(cudaMemcpy(h_pivotArray, d_pivotArray, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost)); 

    rearrange(h_y, h_pivotArray, N); 
    gpuErrchk(cudaMemcpy(d_y, h_y, N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice)); 

    // --- Now P*A=L*U 
    //  Linear system A*x=y => P.'*L*U*x=y => L*U*x=P*y 

    // --- 1st phase - solve Ly = b 
    const double alpha = 1.f; 

    // --- Function solves the triangular linear system with multiple right hand sides, function overrides b as a result 

    // --- Lower triangular part 
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, d_A, N, d_y, N)); 

    // --- Upper triangular part 
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, d_A, N, d_y, N)); 

    timingApproach2 = timingLU + timerApproach2.GetCounter(); 
    printf("Timing approach 2 %f [ms]\n", timingApproach2); 

    /**************************/ 
    /* CHECKING APPROACH NR.2 */ 
    /**************************/ 
    saveGPUrealtxt(d_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach2.txt", N); 

    return 0; 
}

W Utilities.cu i Utilities.cuh pliki potrzebne do uruchomienia takiego przykładu są utrzymywane na tym github page. Pliki TimingGPU.cu i TimingGPU.cuh są przechowywane pod tym numerem github page.

Kilka przydatnych odnośników na trzecim podejściu:

NAG Fortran Library Routine Document

Scientific Computing Software Library (SCSL) User’s Guide

https://www.cs.drexel.edu/~jjohnson/2010-11/summer/cs680/programs/lapack/Danh/verify_sequential.c

EDIT

taktowanie (w ms) dla podejścia nr. 2 i 3 (testy wykonywane na karcie GTX960, cc 5.2).

N  LU decomposition  Approach nr. 2  Approach nr. 3 
100  1.08     2.75     1.28 
500  45.4     161     45.7 
1000  302     1053     303

Jak się okazuje, podejście nr. 3 jest wygodniejszy, a jego koszt jest zasadniczo kosztem obliczania faktoryzacji LU. Ponadto:

rozwiązywania układów liniowych rozkładu LU jest większa niż przy użyciu rozkładu QR (patrz QR decomposition to solve linear systems in CUDA);
Rozkład LU jest ograniczony do kwadratowych systemów liniowych, a rozkład QR pomaga w przypadku nie kwadratowych systemów liniowych.

Poniższa Matlab kod może być używany do sprawdzania wyników

clear all 
close all 
clc 

warning off 

N = 1000; 

% --- Setting the problem solution 
x = ones(N, 1); 

%%%%%%%%%%%%%%%%%%%%% 
% NxN HANKEL MATRIX % 
%%%%%%%%%%%%%%%%%%%%% 
% --- https://en.wikipedia.org/wiki/Hankel_matrix 
load A.txt 
load y.txt 

A = reshape(A, N, N); 

yMatlab = A * x; 
fprintf('Percentage rms between coefficients vectors in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(yMatlab - y).^2))/sum(sum(abs(yMatlab).^2)))); 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
% COMPUTATION OF THE LU DECOMPOSITION % 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
[Lmatlab, Umatlab] = lu(A); 

load Adecomposed.txt 
Adecomposed = reshape(Adecomposed, N, N); 

L = eye(N); 
for k = 1 : N 
    L(k + 1 : N, k) = Adecomposed(k + 1 : N, k); 
end 
U = zeros(N); 
for k = 1 : N 
    U(k, k : N) = Adecomposed(k, k : N); 
end 

load pivotArray.txt 
Pj = eye(N); 
for j = 1 : N 
    tempVector = Pj(j, :); 
    Pj(j, :) = Pj(pivotArray(j), :); 
    Pj(pivotArray(j), :) = tempVector; 
end 

fprintf('Percentage rms between Pj * A and L * U in CUDA %f\n', 100 * sqrt(sum(sum(abs(Pj * A - L * U).^2))/sum(sum(abs(Pj * A).^2)))); 

xprime  = inv(Lmatlab) * yMatlab; 
xMatlab  = inv(Umatlab) * xprime; 

fprintf('Percentage rms between reference solution and solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2))/sum(sum(abs(x).^2)))); 

load xApproach1.txt 
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.1 %f\n', 100 * sqrt(sum(sum(abs(xApproach1 - x).^2))/sum(sum(abs(x).^2)))); 

load xApproach2.txt 
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.2 %f\n', 100 * sqrt(sum(sum(abs(xApproach2 - x).^2))/sum(sum(abs(x).^2))));

Źródło

2015-03-01 21:45:39 JackOLantern

Rozwiązywanie liniowych układów AX = B z CUDA

Odpowiedz

Powiązane problemy