Mnożenie macierzy prostokątnych w CUDA

W ramach tej pracy domowej muszę uzupełnić kod tak, aby mnożył dwie macierze prostokątne za pomocą CUDA C. Po uzupełnieniu kodu przesłałem rozwiązanie: było ono poprawne dla zestawów danych, w których macierze były kwadratowe, natomiast wynik nie zgadzał się z oczekiwanym, gdy macierze nie były kwadratowe.

Oto kod po dodaniu przeze mnie brakujących części:

#include <wb.h> 

// wbCheck(stmt): evaluates a CUDA runtime call once. If the returned status is
// not cudaSuccess, it logs the failing statement and the runtime's description
// of the error, then makes the enclosing function return -1.
// NOTE: it expands to `return -1`, so it is only usable inside functions that
// return int (such as main).
#define wbCheck(stmt) do {                                             \
    cudaError_t err = (stmt);                                          \
    if (err != cudaSuccess) {                                          \
        wbLog(ERROR, "Failed to run stmt ", #stmt);                    \
        wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err));  \
        return -1;                                                     \
    }                                                                  \
} while(0)

// Compute C = A * B 
// Each thread handles at most one output element: (Row, Col) is derived from
// the thread's 2-D block/thread indices. A, B and C are accessed as flat
// arrays through manually computed offsets.
// This is the listing as originally posted with the question; the code is
// left untouched and the suspected defects are only flagged in review notes.
__global__ void matrixMultiply(float * A, float * B, float * C, 
       int numARows, int numAColumns, 
       int numBRows, int numBColumns, 
       int numCRows, int numCColumns) { 
    //@@ Insert code to implement matrix multiplication here 
    // Output coordinates of this thread: y indexes rows, x indexes columns.
    int Row = blockIdx.y * blockDim.y + threadIdx.y; 
    int Col = blockIdx.x * blockDim.x + threadIdx.x; 
    // Inner dimensions disagree: every thread exits without writing to C.
    if (numAColumns != numBRows) return ; 
    // Threads that fall outside the numARows x numBColumns output do nothing.
    if ((Row < numARows) && (Col < numBColumns)){ 
     float Cvalue = 0; 
     // Dot product of row `Row` of A with column `Col` of B.
     for (int k = 0 ; k < numAColumns ; ++k) 
     // NOTE(review): B is indexed with numBRows as its row stride. If B is
     // stored row-major like A (row length = number of columns), the stride
     // should presumably be numBColumns; the two only coincide when B is
     // square. TODO confirm against the data layout.
     Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col]; 
     // NOTE(review): C is indexed with numAColumns as its row stride, but the
     // guard above bounds Col by numBColumns; the stride presumably should be
     // numCColumns. The two only coincide when numAColumns == numBColumns.
     C[Row*numAColumns + Col] = Cvalue; 
    } 

    } 



// Host driver: imports matrices A and B, copies them to the device, launches
// matrixMultiply, copies C back and hands it to wbSolution.
// This is the listing as originally posted with the question; the code is
// left untouched and the suspected defects are only flagged in review notes.
int main(int argc, char ** argv) { 
    wbArg_t args; 
    float * hostA; // The A matrix 
    float * hostB; // The B matrix 
    float * hostC; // The output C matrix 
    float * deviceA; 
    float * deviceB; 
    float * deviceC; 
    int numARows; // number of rows in the matrix A 
    int numAColumns; // number of columns in the matrix A 
    int numBRows; // number of rows in the matrix B 
    int numBColumns; // number of columns in the matrix B 
    int numCRows; // number of rows in the matrix C (you have to set this) 
    int numCColumns; // number of columns in the matrix C (you have to set this) 

    args = wbArg_read(argc, argv); 

    wbTime_start(Generic, "Importing data and creating memory on host"); 
    // wbImport fills in the row/column counts through the pointer arguments.
    hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns); 
    hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns); 
    //@@ Set numCRows and numCColumns 
    // The two zero assignments are immediately overwritten below.
    numCRows = 0; 
    numCColumns = 0; 
    // C has as many rows as A and as many columns as B.
    numCRows = numARows; 
    numCColumns = numBColumns; 
    //@@ Allocate the hostC matrix 
    hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns); 
    wbTime_stop(Generic, "Importing data and creating memory on host"); 

    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns); 
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns); 

    wbTime_start(GPU, "Allocating GPU memory."); 
    //@@ Allocate GPU memory here 
    // NOTE(review): the cudaError_t results of these calls (and of every CUDA
    // call below) are discarded; the wbCheck macro is never used.
    cudaMalloc((void**)&deviceA ,sizeof(float)*numARows*numAColumns); 
    cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns); 
    cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns); 

    wbTime_stop(GPU, "Allocating GPU memory."); 

    wbTime_start(GPU, "Copying input memory to the GPU."); 
    //@@ Copy memory to the GPU here 

    cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice); 
    cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice); 
    wbTime_stop(GPU, "Copying input memory to the GPU."); 

    //@@ Initialize the grid and block dimensions here 

    // NOTE(review): integer division truncates, so when a dimension is not a
    // multiple of 8 the trailing rows/columns get no threads. Also, grid.x is
    // sized from numARows and grid.y from numBColumns, while the kernel maps
    // blockIdx.x to columns and blockIdx.y to rows, so the two look swapped.
    dim3 DimGrid(numARows/8 , numBColumns/8, 1); 
    dim3 DimBlock(8 , 8, 1); 

    wbTime_start(Compute, "Performing CUDA computation"); 

    //@@ Launch the GPU Kernel here 
    matrixMultiply<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns); 

    // Wait for the kernel before stopping the timer.
    // NOTE(review): cudaThreadSynchronize is deprecated in the CUDA runtime in
    // favour of cudaDeviceSynchronize.
    cudaThreadSynchronize(); 
    wbTime_stop(Compute, "Performing CUDA computation"); 

    wbTime_start(Copy, "Copying output memory to the CPU"); 
    //@@ Copy the GPU memory back to the CPU here 
    cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost); 

    wbTime_stop(Copy, "Copying output memory to the CPU"); 

    wbTime_start(GPU, "Freeing GPU Memory"); 
    //@@ Free the GPU memory here 

    cudaFree(deviceA); 
    cudaFree(deviceB); 
    cudaFree(deviceC); 
    wbTime_stop(GPU, "Freeing GPU Memory"); 

    // Hand the computed matrix and its dimensions to the wb library.
    wbSolution(args, hostC, numCRows, numCColumns); 

    free(hostA); 
    free(hostB); 
    free(hostC); 

    return 0; 
}

Mam nadzieję, że ktoś pomoże mi ustalić, co jest tutaj nieprawidłowe.

Źródło

2012-12-15 mzn.rft

Zamień:

Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col];

na:

Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col];

Źródło

2012-12-16 06:12:27 ahmad

Cześć Ahmad, dziękuję za pomoc w ustaleniu, co jest błędne. Po zastosowaniu twojej rady czas przetwarzania był znacznie lepszy, ale wynik nadal nie jest zgodny z oczekiwanym; na przykład dla jednego zestawu danych otrzymuję: „Rozwiązanie nie zgadza się z oczekiwanymi wynikami w kolumnie 124 i wierszu 0. Oczekiwano 457.153, a uzyskano 422.296”. Zgadzam się z tobą, że w funkcji matrixMultiply musi być jakiś błąd; być może muszę zmienić coś jeszcze. –

Zamień: for (int k = 0 ; k < numAColumns ; ++k) Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col]; C[Row*numAColumns + Col] = Cvalue; }

na: for (int k = 0 ; k < numAColumns ; ++k) Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col]; C[Row*numCColumns + Col] = Cvalue; }

Źródło

2012-12-16 10:14:21 Ira

Dziękuję, Ira. Dzięki twojej sugestii kolejny zestaw danych daje poprawny wynik, ale nadal nie uzyskuję w pełni oczekiwanego wyniku dla wszystkich zestawów danych; na przykład w jednym przypadku otrzymałem: „Rozwiązanie nie zgadza się z oczekiwanymi wynikami w kolumnie 200 i wierszu 0. Oczekiwano 415.556, a uzyskano 0.807”. Myślę, że masz rację: problem tkwi gdzieś w funkcji matrixMultiply. –

Zamień

Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col];

na

Cvalue += A[Row*numAColumns +k]* B[k*numBColumns+Col];

oraz

C[Row*numAColumns + Col] = Cvalue;

na

C[Row*numCColumns+Col] = Cvalue;

Źródło

2012-12-16 12:35:47 ram

Dziękuję, ram. Twoje poprawki są słuszne; są podobne do tych, o których wcześniej wspomnieli Ahmad i Ira. Wprowadziłem je, ale nadal nie mogę uzyskać poprawnego wyniku. –

Dzięki pomocy użytkowników Ira, Ahmad, ram i Oli Fly uzyskałem poprawne rozwiązanie; oto ono:

#include <wb.h> 

// wbCheck(stmt): evaluates a CUDA runtime call once. If the returned status is
// not cudaSuccess, it logs the failing statement and the runtime's description
// of the error, then makes the enclosing function return -1.
// NOTE: it expands to `return -1`, so it is only usable inside functions that
// return int (such as main).
#define wbCheck(stmt) do {                                             \
    cudaError_t err = (stmt);                                          \
    if (err != cudaSuccess) {                                          \
        wbLog(ERROR, "Failed to run stmt ", #stmt);                    \
        wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err));  \
        return -1;                                                     \
    }                                                                  \
} while(0)

// Compute C = A * B
// One thread per output element: the thread's 2-D global index selects the
// (row, column) of C it produces. The matrices are addressed as flat arrays
// with a row stride equal to their column count (A: numAColumns,
// B: numBColumns, C: numCColumns).
// Threads outside the numARows x numBColumns output return without writing,
// so the grid may overshoot the matrix.
__global__ void matrixMultiply(float * A, float * B, float * C,
                               int numARows, int numAColumns,
                               int numBRows, int numBColumns,
                               int numCRows, int numCColumns) {
    //@@ Insert code to implement matrix multiplication here
    const int outRow = threadIdx.y + blockDim.y * blockIdx.y;
    const int outCol = threadIdx.x + blockDim.x * blockIdx.x;

    // Inner dimensions must agree; otherwise nothing is written to C.
    if (numAColumns != numBRows) {
        return;
    }

    // Surplus threads of edge blocks have no element to compute.
    if (outRow >= numARows || outCol >= numBColumns) {
        return;
    }

    // Dot product of row outRow of A with column outCol of B.
    const float * aRow = A + outRow * numAColumns;
    float acc = 0;
    int k = 0;
    while (k < numAColumns) {
        acc += aRow[k] * B[k * numBColumns + outCol];
        ++k;
    }

    C[outRow * numCColumns + outCol] = acc;
}

// Host driver for the untiled kernel: imports A and B, validates their shapes,
// copies them to the device, launches matrixMultiply with one thread per
// element of C, copies C back and hands it to wbSolution.
// Returns 0 on success and -1 on invalid input or on any CUDA failure (wbCheck
// returns straight from main; cleanup is then left to process exit).
int main(int argc, char ** argv) {
    // Edge length of the square thread block; the grid is sized from it.
    const int BLOCK_WIDTH = 8;

    wbArg_t args;
    float * hostA; // The A matrix
    float * hostB; // The B matrix
    float * hostC; // The output C matrix
    float * deviceA;
    float * deviceB;
    float * deviceC;
    int numARows; // number of rows in the matrix A
    int numAColumns; // number of columns in the matrix A
    int numBRows; // number of rows in the matrix B
    int numBColumns; // number of columns in the matrix B
    int numCRows; // number of rows in the matrix C (you have to set this)
    int numCColumns; // number of columns in the matrix C (you have to set this)

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
    hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
    //@@ Set numCRows and numCColumns
    numCRows = numARows;
    numCColumns = numBColumns;
    //@@ Allocate the hostC matrix
    hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns);
    wbTime_stop(Generic, "Importing data and creating memory on host");

    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

    // Fail early on the host. The kernel silently writes nothing when the
    // inner dimensions disagree, and an empty output would give a zero-sized
    // grid. The NULL test assumes wbImport reports failure with a null
    // pointer (TODO confirm); it is harmless otherwise.
    if (hostA == NULL || hostB == NULL || hostC == NULL ||
        numCRows <= 0 || numCColumns <= 0 || numAColumns != numBRows) {
        wbLog(ERROR, "Invalid input: missing data or incompatible matrix dimensions");
        free(hostA);
        free(hostB);
        free(hostC);
        return -1;
    }

    const size_t bytesA = sizeof(float)*numARows*numAColumns;
    const size_t bytesB = sizeof(float)*numBRows*numBColumns;
    const size_t bytesC = sizeof(float)*numCRows*numCColumns;

    wbTime_start(GPU, "Allocating GPU memory.");
    //@@ Allocate GPU memory here
    wbCheck(cudaMalloc((void**)&deviceA, bytesA));
    wbCheck(cudaMalloc((void**)&deviceB, bytesB));
    wbCheck(cudaMalloc((void**)&deviceC, bytesC));
    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    //@@ Copy memory to the GPU here
    wbCheck(cudaMemcpy(deviceA, hostA, bytesA, cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceB, hostB, bytesB, cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    //@@ Initialize the grid and block dimensions here
    // Ceil-divide so partially filled edge blocks are launched too; x covers
    // the columns of C and y covers its rows, matching the kernel's indexing.
    dim3 DimGrid((numCColumns + BLOCK_WIDTH - 1)/BLOCK_WIDTH,
                 (numCRows + BLOCK_WIDTH - 1)/BLOCK_WIDTH, 1);
    dim3 DimBlock(BLOCK_WIDTH, BLOCK_WIDTH, 1);

    wbTime_start(Compute, "Performing CUDA computation");

    //@@ Launch the GPU Kernel here
    matrixMultiply<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns);
    // A launch reports configuration errors only through cudaGetLastError.
    wbCheck(cudaGetLastError());
    // Wait for the kernel so the timer is meaningful and execution errors
    // surface here (cudaDeviceSynchronize replaces the deprecated
    // cudaThreadSynchronize).
    wbCheck(cudaDeviceSynchronize());
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    //@@ Copy the GPU memory back to the CPU here
    wbCheck(cudaMemcpy(hostC, deviceC, bytesC, cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");

    wbTime_start(GPU, "Freeing GPU Memory");
    //@@ Free the GPU memory here
    wbCheck(cudaFree(deviceA));
    wbCheck(cudaFree(deviceB));
    wbCheck(cudaFree(deviceC));
    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostC, numCRows, numCColumns);

    free(hostA);
    free(hostB);
    free(hostC);

    return 0;
}

Źródło

2012-12-16 16:28:43

Dzięki za zadanie tego pytania. Naprawdę mi pomogło. Chciałbym zapytać, czy ten kod działał u ciebie także dla zestawów danych, w których wymiary macierzy NIE są wielokrotnościami 8? –

@Abraham, niektóre zestawy danych miały następujące wymiary: macierz A 200 × 100 i macierz B 100 × 256; był też inny zestaw danych, w którym macierz A miała wymiary 100 × 128, a macierz B 128 × 50. –

Uważam, że odpowiedź dla #2 jest w tym kodzie błędna, ponieważ jeden z wymiarów (100) nie dzieli się bez reszty przez rozmiar bloku (8). Konfiguracja siatki tego nie uwzględnia. –

Można też użyć kafelkowego (tiled) mnożenia macierzy; stwierdziłem, że ma ono krótszy czas wykonania.

#include <wb.h> 

// wbCheck(stmt): evaluates a CUDA runtime call once. If the returned status is
// not cudaSuccess, it logs the failing statement and the runtime's description
// of the error, then makes the enclosing function return -1.
// NOTE: it expands to `return -1`, so it is only usable inside functions that
// return int (such as main).
#define wbCheck(stmt) do {                                             \
    cudaError_t err = (stmt);                                          \
    if (err != cudaSuccess) {                                          \
        wbLog(ERROR, "Failed to run stmt ", #stmt);                    \
        wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err));  \
        return -1;                                                     \
    }                                                                  \
} while(0)

// Compute C = A * B using shared-memory tiling.
//
// Launch requirements:
//   * blockDim must be (TILE_WIDTH, TILE_WIDTH) = (32, 32); the shared tiles
//     and the row/column mapping are built on that size.
//   * gridDim.x must cover the columns of C and gridDim.y its rows (ceil of
//     dimension / TILE_WIDTH); surplus threads of edge blocks are tolerated.
//   * Static shared memory: two 32x32 float tiles per block.
// The matrices are addressed as flat arrays with a row stride equal to their
// column count. If numAColumns != numBRows nothing is written to C.
__global__ void matrixMultiplyShared(float * A, float * B, float * C,
                                     int numARows, int numAColumns,
                                     int numBRows, int numBColumns,
                                     int numCRows, int numCColumns) {
    //@@ Insert code to implement matrix multiplication here
    //@@ You have to use shared memory for this MP
    const int TILE_WIDTH = 32;
    __shared__ float sharedA[TILE_WIDTH][TILE_WIDTH];
    __shared__ float sharedB[TILE_WIDTH][TILE_WIDTH];

    // The condition depends only on kernel arguments, so every thread of the
    // block takes the same path and no __syncthreads() below is skipped by a
    // subset of threads.
    if (numAColumns != numBRows) return;

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int Row = blockIdx.y*TILE_WIDTH + ty;
    const int Col = blockIdx.x*TILE_WIDTH + tx;

    // Number of tiles along the shared dimension, computed once with integer
    // ceil-division (exact for any int, unlike a float ceil() in the loop
    // condition).
    const int numTiles = (numAColumns + TILE_WIDTH - 1) / TILE_WIDTH;

    float Cvalue = 0.0f;
    for (int i = 0; i < numTiles; ++i) {
        const int aCol = i*TILE_WIDTH + tx; // column of A loaded by this thread
        const int bRow = i*TILE_WIDTH + ty; // row of B loaded by this thread

        // Cooperative load; positions outside the matrices are zero-filled so
        // they contribute nothing to the partial dot products.
        sharedA[ty][tx] = (Row < numARows && aCol < numAColumns)
                              ? A[Row*numAColumns + aCol] : 0.0f;
        sharedB[ty][tx] = (bRow < numBRows && Col < numBColumns)
                              ? B[bRow*numBColumns + Col] : 0.0f;
        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        // Out-of-range threads accumulate too; the result is discarded by the
        // guarded store below, and this keeps the loop free of divergence.
        for (int j = 0; j < TILE_WIDTH; ++j)
            Cvalue += sharedA[ty][j] * sharedB[j][tx];

        // All reads must finish before the next iteration overwrites the tiles.
        __syncthreads();
    }

    if (Row < numCRows && Col < numCColumns)
        C[Row*numCColumns + Col] = Cvalue;
}




// Host driver for the tiled kernel: imports A and B, validates their shapes,
// copies them to the device, launches matrixMultiplyShared with one
// TILE_WIDTH x TILE_WIDTH block per output tile, copies C back and hands it to
// wbSolution.
// Returns 0 on success and -1 on invalid input or on any CUDA failure (wbCheck
// returns straight from main; cleanup is then left to process exit).
int main(int argc, char ** argv) {
    wbArg_t args;
    float * hostA; // The A matrix
    float * hostB; // The B matrix
    float * hostC; // The output C matrix
    float * deviceA;
    float * deviceB;
    float * deviceC;
    int numARows; // number of rows in the matrix A
    int numAColumns; // number of columns in the matrix A
    int numBRows; // number of rows in the matrix B
    int numBColumns; // number of columns in the matrix B
    int numCRows; // number of rows in the matrix C (you have to set this)
    int numCColumns; // number of columns in the matrix C (you have to set this)
    // Must equal the TILE_WIDTH constant inside matrixMultiplyShared (32).
    const int TILE_WIDTH = 32;

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
    hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
    //@@ Set numCRows and numCColumns
    numCRows = numARows;
    numCColumns = numBColumns;
    //@@ Allocate the hostC matrix
    hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns);
    wbTime_stop(Generic, "Importing data and creating memory on host");

    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

    // Fail early on the host. The kernel silently writes nothing when the
    // inner dimensions disagree, and an empty output would give a zero-sized
    // grid. The NULL test assumes wbImport reports failure with a null
    // pointer (TODO confirm); it is harmless otherwise.
    if (hostA == NULL || hostB == NULL || hostC == NULL ||
        numCRows <= 0 || numCColumns <= 0 || numAColumns != numBRows) {
        wbLog(ERROR, "Invalid input: missing data or incompatible matrix dimensions");
        free(hostA);
        free(hostB);
        free(hostC);
        return -1;
    }

    const size_t bytesA = sizeof(float)*numARows*numAColumns;
    const size_t bytesB = sizeof(float)*numBRows*numBColumns;
    const size_t bytesC = sizeof(float)*numCRows*numCColumns;

    wbTime_start(GPU, "Allocating GPU memory.");
    //@@ Allocate GPU memory here
    wbCheck(cudaMalloc((void**)&deviceA, bytesA));
    wbCheck(cudaMalloc((void**)&deviceB, bytesB));
    wbCheck(cudaMalloc((void**)&deviceC, bytesC));
    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    //@@ Copy memory to the GPU here
    wbCheck(cudaMemcpy(deviceA, hostA, bytesA, cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceB, hostB, bytesB, cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    //@@ Initialize the grid and block dimensions here
    // Integer ceil-division: one block per (possibly partial) output tile.
    const int dimX = (numCColumns + TILE_WIDTH - 1) / TILE_WIDTH;
    const int dimY = (numCRows + TILE_WIDTH - 1) / TILE_WIDTH;
    dim3 DimGrid(dimX, dimY);
    dim3 DimBlock(TILE_WIDTH, TILE_WIDTH);

    wbTime_start(Compute, "Performing CUDA computation");
    //@@ Launch the GPU Kernel here
    matrixMultiplyShared<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns);
    // A launch reports configuration errors only through cudaGetLastError.
    wbCheck(cudaGetLastError());
    // Wait for the kernel so the timer is meaningful and execution errors
    // surface here (cudaDeviceSynchronize replaces the deprecated
    // cudaThreadSynchronize).
    wbCheck(cudaDeviceSynchronize());
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    //@@ Copy the GPU memory back to the CPU here
    wbCheck(cudaMemcpy(hostC, deviceC, bytesC, cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");

    wbTime_start(GPU, "Freeing GPU Memory");
    //@@ Free the GPU memory here
    wbCheck(cudaFree(deviceA));
    wbCheck(cudaFree(deviceB));
    wbCheck(cudaFree(deviceC));
    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostC, numCRows, numCColumns);

    free(hostA);
    free(hostB);
    free(hostC);

    return 0;
}

Źródło

2013-01-05 15:57:27

Mnożenie macierzy prostokątnych w CUDA

Odpowiedz

Powiązane problemy