Math Problem Statement
for (int k = 0; k < K; ++k) { for (int c = 0; c < C; ++c) { float *filters_ptr = filter + (k * C + c) * sizeF; sgemm(&G[0][0], filters_ptr, tmp_u, 4, 3, 3); sgemm(tmp_u, &G_T[0][0], u, 4, 3, 4); for (int xi = 0; xi < 4; ++xi) { int base_index = ((xi * 4) * K + k) * C + c; memcpy(&U[base_index], &u[xi * 4], 4 * sizeof(float)); } } } 我们将 U 矩阵的存储方式改变,而后 U 矩阵的读取方式也要相应地改变;最后 V 矩阵和 U 矩阵的计算结果要保持不变。 float tmp_v[16]; float d[16]; // d: [4 * 4]; float v[16]; // v: [4 * 4]; #pragma omp parallel for collapse(2) private(tmp_v, d, v) for (int n = 0; n < N; ++n) for (int c = 0; c < C; ++c) { for (int y = 0; y < outHeight / 2; ++y) { for (int x = 0; x < outWidth / 2; ++x) {
// Generate d_cb for (int iy = 0; iy < 4; ++iy) for (int ix = 0; ix < 4; ++ix) d[iy * 4 + ix] = image[(n * C + c) * sizeI + (y * 2 + iy) * inWidth + (x * 2 + ix)]; sgemm(&B_T[0][0], d, tmp_v, 4, 4, 4); sgemm(tmp_v, &B[0][0], v, 4, 4, 4); int b = ((n * outHeight / 2) + y) * outWidth / 2 + x; for (int xi = 0; xi < 4; ++xi) for (int nu = 0; nu < 4; ++nu) V[((long)(xi * 4 + nu) * C + c) * P + b] = v[xi * 4 + nu]; } } }
// M[xi, nu, :, :] = U[xi, nu, :, :] * V[xi, nu, :, :] for (int xi = 0; xi < 4; ++xi) { for (int nu = 0; nu < 4; ++nu) { float *M_ptr = M + (long)(xi * 4 + nu) * K * P; float *U_ptr = U + (long)(xi * 4 + nu) * K * C; float *V_ptr = V + (long)(xi * 4 + nu) * C * P; sgemm_parallel(U_ptr, V_ptr, M_ptr, K, C, P); } }
// Y = A_T * m * A float mm[16]; // 4 * 4 float tmp_m[8]; // 2 * 4 float temp_out[4]; // 2 * 2 for (int n = 0; n < N; ++n) for (int k = 0; k < K; ++k) { for (int y = 0; y < outHeight / 2; ++y) { for (int x = 0; x < outWidth / 2; ++x) { int b = (n * outHeight / 2 + y) * outWidth / 2 + x; for (long xi = 0; xi < 4; ++xi) { for (long nu = 0; nu < 4; ++nu) { mm[xi * 4 + nu] = M[((xi * 4 + nu) * K + k) * P + b]; } } sgemm(&A_T[0][0], mm, tmp_m, 2, 4, 4); sgemm(tmp_m, &A[0][0], temp_out, 2, 4, 2); for (int i = 0; i < 2; ++i) for (int j = 0; j < 2; ++j) out[(long)((n * K + k) * outHeight + y * 2 + i) * outWidth + x * 2 + j] = temp_out[i * 2 + j]; } } } } 写出最终正确的代码,并且用中文解释。
Solution
Ask a new question for Free
By Image
Drop file here or Click Here to upload
Math Problem Analysis
Mathematical Concepts
Linear Algebra
Matrix Multiplication
Parallel Computing
Formulas
SGEMM algorithm for matrix multiplication
Theorems
-
Suitable Grade Level
Advanced Mathematics
Related Recommendation
Advanced Matrix Operations and SGEMM Functions Explained
Matrix Operations: Determinants, Row Reduction, and Eigenvalues for a 4x4 Matrix
Matrix Operations: Addition, Multiplication, and Determinants
Matrix Operations: Working with Two Matrices - Addition, Multiplication, Determinants, and Inverses
Step-by-Step Guide to Proving Matrix Exercises