Optimizing Matrix Operations in Parallel Computing

Math Problem Statement

for (int k = 0; k < K; ++k) { for (int c = 0; c < C; ++c) { float *filters_ptr = filter + (k * C + c) * sizeF; sgemm(&G[0][0], filters_ptr, tmp_u, 4, 3, 3); sgemm(tmp_u, &G_T[0][0], u, 4, 3, 4); for (int xi = 0; xi < 4; ++xi) { int base_index = ((xi * 4) * K + k) * C + c; memcpy(&U[base_index], &u[xi * 4], 4 * sizeof(float)); } } }我们将U矩阵的存储方式改变，而后U矩阵的读取方式也要相应的改变最后V矩阵和U矩阵的计算结果要保持不变float tmp_v[16]; float d[16]; // d: [4 * 4]; float v[16]; // v: [4 * 4]; #pragma omp parallel for collapse(2) private(tmp_v, d, v) for (int n = 0; n < N; ++n) for (int c = 0; c < C; ++c) { for (int y = 0; y < outHeight / 2; ++y) { for (int x = 0; x < outWidth / 2; ++x) {

// Generate d_cb for (int iy = 0; iy < 4; ++iy) for (int ix = 0; ix < 4; ++ix) d[iy * 4 + ix] = image[(n * C + c) * sizeI + (y * 2 + iy) * inWidth + (x * 2 + ix)]; sgemm(&B_T[0][0], d, tmp_v, 4, 4, 4); sgemm(tmp_v, &B[0][0], v, 4, 4, 4); int b = ((n * outHeight / 2) + y) * outWidth / 2 + x; for (int xi = 0; xi < 4; ++xi) for (int nu = 0; nu < 4; ++nu) V[((long)(xi * 4 + nu) * C + c) * P + b] = v[xi * 4 + nu]; } } }

// M[xi, nu, :, :] = U[xi, nu, :, :] * V[xi, nu, :, :] for (int xi = 0; xi < 4; ++xi) { for (int nu = 0; nu < 4; ++nu) { float *M_ptr = M + (long)(xi * 4 + nu) * K * P; float *U_ptr = U + (long)(xi * 4 + nu) * K * C; float *V_ptr = V + (long)(xi * 4 + nu) * C * P; sgemm_parallel(U_ptr, V_ptr, M_ptr, K, C, P); } }

// Y = A_T * m * A float mm[16]; // 4 * 4 float tmp_m[8]; // 2 * 4 float temp_out[4]; // 2 * 2 for (int n = 0; n < N; ++n) for (int k = 0; k < K; ++k) { for (int y = 0; y < outHeight / 2; ++y) { for (int x = 0; x < outWidth / 2; ++x) { int b = (n * outHeight / 2 + y) * outWidth / 2 + x; for (long xi = 0; xi < 4; ++xi) { for (long nu = 0; nu < 4; ++nu) { mm[xi * 4 + nu] = M[((xi * 4 + nu) * K + k) * P + b]; } } sgemm(&A_T[0][0], mm, tmp_m, 2, 4, 4); sgemm(tmp_m, &A[0][0], temp_out, 2, 4, 2); for (int i = 0; i < 2; ++i) for (int j = 0; j < 2; ++j) out[(long)((n * K + k) * outHeight + y * 2 + i) * outWidth + x * 2 + j] = temp_out[i * 2 + j]; } } } }写出最终正确的带吗，并且用中文解释

Solution

下面是修改后的代码，保证U矩阵的存储方式改变后，U矩阵的读取方式也相应改变，同时保持最终计算结果V矩阵和U矩阵不变。

cpp
for (int k = 0; k < K; ++k) {
    for (int c = 0; c < C; ++c) {
        float *filters_ptr = filter + (k * C + c) * sizeF;
        sgemm(&G[0][0], filters_ptr, tmp_u, 4, 3, 3);
        sgemm(tmp_u, &G_T[0][0], u, 4, 3, 4);
        for (int xi = 0; xi < 4; ++xi) {
            int base_index = ((xi * K) + k) * C * 4 + c * 4; // 修改后的存储方式
            memcpy(&U[base_index], &u[xi * 4], 4 * sizeof(float));
        }
    }
}

float tmp_v[16];
float d[16];  // d: [4 * 4];
float v[16];  // v: [4 * 4];
#pragma omp parallel for collapse(2) private(tmp_v, d, v)
for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c) {
        for (int y = 0; y < outHeight / 2; ++y) {
            for (int x = 0; x < outWidth / 2; ++x) {

                // 生成d_cb
                for (int iy = 0; iy < 4; ++iy)
                    for (int ix = 0; ix < 4; ++ix)
                        d[iy * 4 + ix] = image[(n * C + c) * sizeI + (y * 2 + iy) * inWidth + (x * 2 + ix)];
                sgemm(&B_T[0][0], d, tmp_v, 4, 4, 4);
                sgemm(tmp_v, &B[0][0], v, 4, 4, 4);
                int b = ((n * outHeight / 2) + y) * outWidth / 2 + x;
                for (int xi = 0; xi < 4; ++xi)
                    for (int nu = 0; nu < 4; ++nu)
                        V[((long)(xi * 4 + nu) * C + c) * P + b] = v[xi * 4 + nu];
            }
        }
    }

// M[xi, nu, :, :] = U[xi, nu, :, :] * V[xi, nu, :, :]
for (int xi = 0; xi < 4; ++xi) {
    for (int nu = 0; nu < 4; ++nu) {
        float *M_ptr = M + (long)(xi * 4 + nu) * K * P;
        float *U_ptr = U + (long)(xi * K) * C * 4 + nu * 4; // 修改后的读取方式
        float *V_ptr = V + (long)(xi * 4 + nu) * C * P;
        sgemm_parallel(U_ptr, V_ptr, M_ptr, K, C, P);
    }
}

// Y = A_T * m * A
float mm[16];       // 4 * 4
float tmp_m[8];     // 2 * 4
float temp_out[4];  // 2 * 2
for (int n = 0; n < N; ++n)
    for (int k = 0; k < K; ++k) {
        for (int y = 0; y < outHeight / 2; ++y) {
            for (int x = 0; x < outWidth / 2; ++x) {
                int b = (n * outHeight / 2 + y) * outWidth / 2 + x;
                for (long xi = 0; xi < 4; ++xi) {
                    for (long nu = 0; nu < 4; ++nu) {
                        mm[xi * 4 + nu] = M[((xi * 4 + nu) * K + k) * P + b];
                    }
                }
                sgemm(&A_T[0][0], mm, tmp_m, 2, 4, 4);
                sgemm(tmp_m, &A[0][0], temp_out, 2, 4, 2);
                for (int i = 0; i < 2; ++i)
                    for (int j = 0; j < 2; ++j)
                        out[(long)((n * K + k) * outHeight + y * 2 + i) * outWidth + x * 2 + j] = temp_out[i * 2 + j];
            }
        }
    }
}

代码解释

U矩阵的存储和读取方式改变:
- U矩阵存储方式改变后，索引方式也相应调整。原始代码中是按照((xi * 4) * K + k) * C + c的索引方式，现在改为((xi * K) + k) * C * 4 + c * 4存储。
- 在读取时，相应的索引方式也调整为(xi * K) * C * 4 + nu * 4。
矩阵操作保持不变:
- 通过这种方式的改变，V矩阵和U矩阵的最终计算结果保持不变。

如果有任何疑问或需要更详细的解释，请告诉我！

你可能会感兴趣的问题：

为什么改变U矩阵的存储方式？
SGEMM函数是如何优化矩阵乘法的？
OpenMP在代码中的作用是什么？
如何理解矩阵的索引方式？
修改后的代码对性能有什么影响？

提示：

理解和优化矩阵操作对数值计算和机器学习的性能至关重要。

Ask a new question for Free

By Image

Drop file here or Click Here to upload

Math Problem Analysis

Mathematical Concepts

Linear Algebra
Matrix Multiplication
Parallel Computing

Formulas

SGEMM algorithm for matrix multiplication

Theorems

Suitable Grade Level

Advanced Mathematics

Related Recommendation

Advanced Matrix Operations and SGEMM Functions Explained

Matrix Multiplication and Quadratic Solutions – Step-by-Step Guide

Matrix Operations: Determinants, Row Reduction, and Eigenvalues for a 4x4 Matrix

Matrix Multiplication: Converting to BF16 Format in C

Matrix Operations: Addition, Multiplication, and Determinants