首页 > 其他分享 >bupt_os_lab2

bupt_os_lab2

时间:2024-11-18 14:40:53浏览次数:1  
标签:end thread int data bupt lab2 os block size

easy_lab2

作业地址:https://github.com/BUPT-OS/easy_lab/tree/lab2

image-20241118145114621

以下均为multiply.cpp代码

22分多线程,分块,调整计算顺序

#include "multiply.h"
#include <emmintrin.h>   // SSE2
#include <pmmintrin.h>   // SSE3
#include <thread>
#include <vector>

// 定义块大小和线程数量
#define BLOCK_SIZE 32

// Blocked (tiled) matrix multiply over rows [i_start, i_end) and columns
// [j_start, j_end): result_matrix += matrix1 * matrix2.
// The caller must zero-initialize result_matrix before the first call.
void block_multiply(double matrix1[N][M], double matrix2[M][P], double result_matrix[N][P], 
                    int i_start, int i_end, int j_start, int j_end) 
{
    for (int ii = i_start; ii < i_end; ii += BLOCK_SIZE) {
        for (int jj = j_start; jj < j_end; jj += BLOCK_SIZE) {
            for (int kk = 0; kk < M; kk += BLOCK_SIZE) {

                // Loop over one BLOCK_SIZE x BLOCK_SIZE tile.
                for (int i = ii; i < ii + BLOCK_SIZE && i < i_end; i++) {
                    for (int j = jj; j < jj + BLOCK_SIZE && j < j_end; j++) {
                        __m128d sum = _mm_setzero_pd();  // SIMD accumulator (2 doubles)

                        // Process k in pairs with SSE2. Both k and k+1 must be
                        // valid: BUGFIX — the guard was `k < M`, which read
                        // matrix1[i][M] / matrix2[M][j] out of bounds whenever
                        // the valid k range ended on an odd count (e.g. M odd).
                        int k;
                        for (k = kk; k <= kk + BLOCK_SIZE - 2 && k + 1 < M; k += 2) {
                            __m128d m1 = _mm_loadu_pd(&matrix1[i][k]);
                            // matrix2 is walked down a column, so the two
                            // non-contiguous elements are gathered manually.
                            __m128d m2 = _mm_set_pd(matrix2[k+1][j], matrix2[k][j]);
                            sum = _mm_add_pd(sum, _mm_mul_pd(m1, m2));
                        }

                        // Horizontal sum of the two lanes into the output cell.
                        double temp[2];
                        _mm_storeu_pd(temp, sum);
                        result_matrix[i][j] += temp[0] + temp[1];

                        // Scalar tail: at most one leftover k in this tile.
                        for (; k < kk + BLOCK_SIZE && k < M; k++) {
                            result_matrix[i][j] += matrix1[i][k] * matrix2[k][j];
                        }
                    }
                }
            }
        }
    }
}

// Multiplies matrix1 (N x M) by matrix2 (M x P) into result_matrix (N x P),
// splitting the rows across hardware threads. Each thread writes a disjoint
// row strip, so no locking is needed beyond the final joins.
void matrix_multiplication(double matrix1[N][M], double matrix2[M][P], double result_matrix[N][P]) 
{
    // Zero the output; block_multiply accumulates into it.
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < P; j++) {
            result_matrix[i][j] = 0.0;
        }
    }

    // hardware_concurrency() may return 0 when the value is not computable;
    // BUGFIX: fall back to 1 to avoid division by zero below. Also never
    // spawn more threads than there are rows.
    int num_threads = static_cast<int>(std::thread::hardware_concurrency());
    if (num_threads <= 0) num_threads = 1;
    if (num_threads > N) num_threads = N;

    std::vector<std::thread> threads;
    threads.reserve(num_threads);
    int rows_per_thread = N / num_threads;

    // Split rows evenly; the last thread absorbs the remainder.
    for (int t = 0; t < num_threads; t++) {
        int i_start = t * rows_per_thread;
        int i_end = (t == num_threads - 1) ? N : (t + 1) * rows_per_thread;

        threads.emplace_back(block_multiply, matrix1, matrix2, result_matrix, i_start, i_end, 0, P);
    }

    // Wait for all workers before returning the result.
    for (auto &th : threads) {
        th.join();
    }
}


36分

#include "multiply.h"
#include <cstdio>
#include <thread>
#include <immintrin.h>
#include <pmmintrin.h>
#include <algorithm>
#include <pthread.h>

#define L1_BLOCK_SIZE 64
#define L2_BLOCK_SIZE 128
#define L3_BLOCK_SIZE 256

// Per-thread work description handed to thread_matrix_multiply via pthread_create.
typedef struct {
    int thread_id;                // index used to derive this thread's row strip
    double (*matrix1)[M];         // left operand, N x M
    double (*matrix2)[P];         // right operand, M x P
    double (*result_matrix)[P];   // output, N x P; pre-zeroed by the caller, accumulated into
    int block_size;               // cache-blocking tile edge length
} ThreadData;

// Multiplies one tile: for rows [row_start, row_end) and columns
// [col_start, col_end), accumulates matrix1(row, mid) * matrix2(mid, col)
// over mid in [mid_start, mid_end) into result_matrix. The i-k-j loop order
// keeps both the matrix2 row and the result row streaming sequentially.
// block_size is unused here; tiling is driven entirely by the range arguments.
void block_matrix_multiply(double (*matrix1)[M], double (*matrix2)[P], double (*result_matrix)[P],
                           int row_start, int row_end, int col_start, int col_end, int mid_start, int mid_end, int block_size) {
    for (int row = row_start; row < row_end; row++) {
        double *out_row = result_matrix[row];
        for (int mid = mid_start; mid < mid_end; mid++) {
            const double scalar = matrix1[row][mid];
            const double *rhs_row = matrix2[mid];
            const __m512d scalar_vec = _mm512_set1_pd(scalar);  // broadcast a(row, mid)

            int col = col_start;
            // Vector body: 8 doubles per step, fused multiply-add (AVX-512).
            for (; col + 7 < col_end; col += 8) {
                __m512d rhs = _mm512_loadu_pd(rhs_row + col);
                __m512d acc = _mm512_loadu_pd(out_row + col);
                _mm512_storeu_pd(out_row + col, _mm512_fmadd_pd(scalar_vec, rhs, acc));
            }
            // Scalar tail for the final (col_end - col) < 8 columns.
            for (; col < col_end; col++) {
                out_row[col] += scalar * rhs_row[col];
            }
        }
    }
}

// pthread entry point: computes this thread's horizontal strip of the result,
// tiling i/j/k by data->block_size for cache locality.
// arg must point to a ThreadData that outlives the thread.
void* thread_matrix_multiply(void* arg) {
    ThreadData* data = (ThreadData*)arg;

    // BUGFIX: hardware_concurrency() may return 0 (unknown), which would
    // divide by zero below. Clamp to 1; this must mirror the spawner's count
    // so the row partition lines up.
    int NUM_THREADS = (int)std::thread::hardware_concurrency();
    if (NUM_THREADS <= 0) NUM_THREADS = 1;

    int rows_per_thread = (N + NUM_THREADS - 1) / NUM_THREADS;  // ceil(N / threads)
    int start_row = data->thread_id * rows_per_thread;
    int end_row = std::min(start_row + rows_per_thread, N);

    // Tile the strip; block_matrix_multiply handles ragged edges via min().
    for (int i = start_row; i < end_row; i += data->block_size) {
        int block_i_end = std::min(i + data->block_size, end_row);
        for (int j = 0; j < P; j += data->block_size) {
            int block_j_end = std::min(j + data->block_size, P);
            for (int k = 0; k < M; k += data->block_size) {
                int block_k_end = std::min(k + data->block_size, M);
                block_matrix_multiply(data->matrix1, data->matrix2, data->result_matrix,
                                      i, block_i_end, j, block_j_end, k, block_k_end, data->block_size);
            }
        }
    }

    pthread_exit(NULL);
}

// Multiplies matrix1 (N x M) by matrix2 (M x P) into result_matrix (N x P)
// using one pthread per hardware thread; each worker owns a disjoint row
// strip, so the only synchronization needed is the final joins.
void matrix_multiplication(double matrix1[N][M], double matrix2[M][P], double result_matrix[N][P]) {
    // Zero the output; the workers accumulate into it.
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < P; ++j) {
            result_matrix[i][j] = 0.0;
        }
    }

    // BUGFIX: hardware_concurrency() may return 0, which previously produced
    // zero-length arrays and no threads — the result silently stayed all zero.
    int NUM_THREADS = (int)std::thread::hardware_concurrency();
    if (NUM_THREADS <= 0) NUM_THREADS = 1;

    // NOTE(review): variable-length arrays are a GCC extension, not standard
    // C++; std::vector would be the portable choice.
    pthread_t threads[NUM_THREADS];
    ThreadData thread_data[NUM_THREADS];

    // Pick the tile size to roughly match the cache level the working set fits in.
    int block_size;
    if (N >= 2048) {
        block_size = L3_BLOCK_SIZE;
    } else if (N >= 1024) {
        block_size = L2_BLOCK_SIZE;
    } else {
        block_size = L1_BLOCK_SIZE;
    }

    for (int i = 0; i < NUM_THREADS; i++) {
        thread_data[i].thread_id = i;
        thread_data[i].matrix1 = matrix1;
        thread_data[i].matrix2 = matrix2;
        thread_data[i].result_matrix = result_matrix;
        thread_data[i].block_size = block_size;
        pthread_create(&threads[i], NULL, thread_matrix_multiply, (void*)&thread_data[i]);
    }

    // Wait for every worker before returning.
    for (int i = 0; i < NUM_THREADS; i++) {
        pthread_join(threads[i], NULL);
    }
}

70分:加上 O3 优化(`#pragma GCC optimize("O3")`),会快非常多

#include "multiply.h"
#include <cstdio>
#include <thread>
#include <immintrin.h>
#include <pmmintrin.h>
#include <algorithm>
#include <pthread.h>
#pragma GCC optimize("O3")


#define L1_BLOCK_SIZE 64
#define L2_BLOCK_SIZE 128
#define L3_BLOCK_SIZE 256

// Per-thread work description handed to thread_matrix_multiply via pthread_create.
typedef struct {
    int thread_id;                // index used to derive this thread's row strip
    double (*matrix1)[M];         // left operand, N x M
    double (*matrix2)[P];         // right operand, M x P
    double (*result_matrix)[P];   // output, N x P; pre-zeroed by the caller, accumulated into
    int block_size;               // cache-blocking tile edge length
} ThreadData;

// Accumulates one tile of the product into result_matrix:
//   result[r][c] += sum over kk in [mid_start, mid_end) of m1[r][kk] * m2[kk][c]
// for r in [row_start, row_end), c in [col_start, col_end).
// The i-k-j ordering streams both the matrix2 row and the output row.
// block_size is unused; the tile extent comes from the range arguments.
void block_matrix_multiply(double (*matrix1)[M], double (*matrix2)[P], double (*result_matrix)[P],
                           int row_start, int row_end, int col_start, int col_end, int mid_start, int mid_end, int block_size) {
    for (int r = row_start; r < row_end; r++) {
        double *dst = result_matrix[r];
        for (int kk = mid_start; kk < mid_end; kk++) {
            const double a_val = matrix1[r][kk];
            const double *src = matrix2[kk];
            const __m512d a_bcast = _mm512_set1_pd(a_val);  // replicate a(r, kk) into all 8 lanes

            int c = col_start;
            // Main vector loop: one AVX-512 fused multiply-add per 8 columns.
            while (c + 7 < col_end) {
                __m512d b_vec = _mm512_loadu_pd(src + c);
                __m512d d_vec = _mm512_loadu_pd(dst + c);
                _mm512_storeu_pd(dst + c, _mm512_fmadd_pd(a_bcast, b_vec, d_vec));
                c += 8;
            }
            // Remainder columns handled scalar.
            while (c < col_end) {
                dst[c] += a_val * src[c];
                c++;
            }
        }
    }
}

// pthread entry point: computes this thread's horizontal strip of the result,
// tiling i/j/k by data->block_size for cache locality.
// arg must point to a ThreadData that outlives the thread.
void* thread_matrix_multiply(void* arg) {
    ThreadData* data = (ThreadData*)arg;

    // BUGFIX: hardware_concurrency() may return 0 (unknown), which would
    // divide by zero below. Clamp to 1; this must mirror the spawner's count
    // so the row partition lines up.
    int NUM_THREADS = (int)std::thread::hardware_concurrency();
    if (NUM_THREADS <= 0) NUM_THREADS = 1;

    int rows_per_thread = (N + NUM_THREADS - 1) / NUM_THREADS;  // ceil(N / threads)
    int start_row = data->thread_id * rows_per_thread;
    int end_row = std::min(start_row + rows_per_thread, N);

    // Tile the strip; block_matrix_multiply handles ragged edges via min().
    for (int i = start_row; i < end_row; i += data->block_size) {
        int block_i_end = std::min(i + data->block_size, end_row);
        for (int j = 0; j < P; j += data->block_size) {
            int block_j_end = std::min(j + data->block_size, P);
            for (int k = 0; k < M; k += data->block_size) {
                int block_k_end = std::min(k + data->block_size, M);
                block_matrix_multiply(data->matrix1, data->matrix2, data->result_matrix,
                                      i, block_i_end, j, block_j_end, k, block_k_end, data->block_size);
            }
        }
    }

    pthread_exit(NULL);
}

// Multiplies matrix1 (N x M) by matrix2 (M x P) into result_matrix (N x P)
// using one pthread per hardware thread; each worker owns a disjoint row
// strip, so the only synchronization needed is the final joins.
void matrix_multiplication(double matrix1[N][M], double matrix2[M][P], double result_matrix[N][P]) {
    // Zero the output; the workers accumulate into it.
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < P; ++j) {
            result_matrix[i][j] = 0.0;
        }
    }

    // BUGFIX: hardware_concurrency() may return 0, which previously produced
    // zero-length arrays and no threads — the result silently stayed all zero.
    int NUM_THREADS = (int)std::thread::hardware_concurrency();
    if (NUM_THREADS <= 0) NUM_THREADS = 1;

    // NOTE(review): variable-length arrays are a GCC extension, not standard
    // C++; std::vector would be the portable choice.
    pthread_t threads[NUM_THREADS];
    ThreadData thread_data[NUM_THREADS];

    // Pick the tile size to roughly match the cache level the working set fits in.
    int block_size;
    if (N >= 2048) {
        block_size = L3_BLOCK_SIZE;
    } else if (N >= 1024) {
        block_size = L2_BLOCK_SIZE;
    } else {
        block_size = L1_BLOCK_SIZE;
    }

    for (int i = 0; i < NUM_THREADS; i++) {
        thread_data[i].thread_id = i;
        thread_data[i].matrix1 = matrix1;
        thread_data[i].matrix2 = matrix2;
        thread_data[i].result_matrix = result_matrix;
        thread_data[i].block_size = block_size;
        pthread_create(&threads[i], NULL, thread_matrix_multiply, (void*)&thread_data[i]);
    }

    // Wait for every worker before returning.
    for (int i = 0; i < NUM_THREADS; i++) {
        pthread_join(threads[i], NULL);
    }
}

标签:end,thread,int,data,bupt,lab2,os,block,size
From: https://www.cnblogs.com/vastjoy/p/18552651

相关文章

  • OSG开发笔记(三十二):深入理解相机视口、制作支持与主视图同步变换旋转的相机HUD
    前言  深入理解相机视口,摸索相机视口旋转功能,背景透明或者不透明。  本篇,实现了一个左下角旋转HUD且背景透明的相机视口。 Demo           HUD相机的坐标  抬头HUD就是通过投影矩阵来实现,具体可参看《OSG开发笔记(二十):OSG使用HUD......
  • Charles抓取不到浏览器请求(macOS)解决方案
    一.Charlescharles抓包工具,通常使用默认的端口号:http:8888https:8443在安装好Charles后,默认情况下http代理服务器运行在8888端口,https服务器运行在8443端口二.解决方案:1.先把证书安装上下载证书后,打开macOS-【钥匙串访问】,选择证书,右键【显示简介】--【信任】中勾选始终信......
  • toolchain build host target
    一个程序开发完成以后,对源代码进行编译,将编译后的文件发布出去形成所谓的各个平台的安装版本(非开源的都是这么干的,开源的也可以这样编译后不同运行平台的编译版本),这就是所谓的交叉编译。下面介绍有关这平台相关性的选项。  --build=BUILD  configureforbuildingonBU......
  • macos安装Charles
    一.安装jdk1.地址:https://www.oracle.com/cn/java/technologies/downloads/#java11,下载后双击安装2.查看java的安装地址,在macOS终端输入/usr/libexec/java_home,终端会显示出Java的安装路径。这个路径通常是“/Library/Java/JavaVirtualMachines/”后面跟着Java的版本号。3.配置......
  • 【FAQ】HarmonyOS SDK 闭源开放能力 —Share Kit
    1.问题描述:使用系统分享组件分享本地文件,点击分享菜单下方的“另存为”将要分享的文件分享至系统文件管理中,在文件管理中查看分享进来的文件为0B。尝试了3种uri的写法都不行,代码如下:consturi=getContext().getApplicationContext().filesDir+'/xxx.json'consturi1='fi......
  • vllm docker-compose
    version:'3'services:moonlit-vllm-qwen-hotvideo:image:vllm/vllm-openai:v0.6.2container_name:hitvideos_apirestart:alwayscommand:["--served-model-name","qwen2.5-14b-hitvideos","......
  • Permission denied (publickey). fatal: Could not read from remote repository.
    将本地代码推送到远程分支报错:Permissiondenied(publickey).fatal:Couldnotreadfromremoterepository.确保已经添加了正确的 SSH密钥。可以使用以下命令检查SSH密钥是否已经添加:[email protected]如果看到消息“Hi[username]!You'vesuccessfullyauthen......
  • HarmonyOS Next 对称密钥生成全攻略:从原理到实践
    本文旨在深入探讨华为鸿蒙HarmonyOSNext系统(截止目前API12)的技术细节,基于实际开发实践进行总结。主要作为技术分享与交流载体,难免错漏,欢迎各位同仁提出宝贵意见和问题,以便共同进步。本文为原创内容,任何形式的转载必须注明出处及原作者。一、引言在数据安全的浩瀚领域中,对称......
  • HarmonyOS Next 加解密算法框架入门:基础概念与功能概述
    本文旨在深入探讨华为鸿蒙HarmonyOSNext系统(截止目前API12)的技术细节,基于实际开发实践进行总结。主要作为技术分享与交流载体,难免错漏,欢迎各位同仁提出宝贵意见和问题,以便共同进步。本文为原创内容,任何形式的转载必须注明出处及原作者。一、引言在当今数字化时代,信息安全犹......
  • HarmonyOS Next 椭圆曲线密码学应用:ECC 与 SM2 深入剖析
    本文旨在深入探讨华为鸿蒙HarmonyOSNext系统(截止目前API12)的技术细节,基于实际开发实践进行总结。主要作为技术分享与交流载体,难免错漏,欢迎各位同仁提出宝贵意见和问题,以便共同进步。本文为原创内容,任何形式的转载必须注明出处及原作者。一、引言在现代密码学领域,椭圆曲线密......