c++ - OpenMp 并行-6ren

c++ - OpenMp 并行

转载 作者：搜寻专家 更新时间：2023-10-31 02:11:04

我有以下称为 pgain 的方法，它调用我试图并行化的方法 dist:

/******************************************************************************/

/* For a given point x, find the cost of the following operation:
 * -- open a facility at x if there isn't already one there,
 * -- for points y such that the assignment distance of y exceeds dist(y, x),
 * make y a member of x,
 * -- for facilities y such that reassigning y and all its members to x
 * would save cost, realize this closing and reassignment.
 *
 * If the cost of this operation is negative (i.e., if this entire operation
 * saves cost), perform this operation and return the amount of cost saved;
 * otherwise, do nothing.
 */

/* numcenters will be updated to reflect the new number of centers */
/* z is the facility cost, x is the number of this point in the array
   points */
/* Relies on state declared elsewhere in the program: is_center,
 * center_table, switch_membership and CACHE_LINE.
 * Returns the cost saved (>= 0); 0 means nothing was changed. */
double pgain ( long x, Points *points, double z, long int *numcenters )
{
    int i;
    int number_of_centers_to_close = 0;

    static double *work_mem;
    static double gl_cost_of_opening_x;
    static int gl_number_of_centers_to_close;

    int stride = *numcenters + 2;
    //make stride a multiple of CACHE_LINE
    int cl = CACHE_LINE/sizeof ( double );
    if ( stride % cl != 0 ) {
        stride = cl * ( stride / cl + 1 );
    }
    // K >= *numcenters; slots K and K+1 of work_mem are used as scratch below
    int K = stride - 2 ;

    //my own cost of opening x
    double cost_of_opening_x = 0;

    work_mem = ( double* ) malloc ( 2 * stride * sizeof ( double ) );
    gl_cost_of_opening_x = 0;
    gl_number_of_centers_to_close = 0;

    /*
     * For each center, we have a *lower* field that indicates
     * how much we will save by closing the center.
     * center_table maps a center's point index to its slot in that table.
     */
    int count = 0;
    for ( int i = 0; i < points->num; i++ ) {
        if ( is_center[i] ) {
            center_table[i] = count++;
        }
    }
    work_mem[0] = 0;

    //now we finish building the table. clear the working memory.
    memset ( switch_membership, 0, points->num * sizeof ( bool ) );
    memset ( work_mem, 0, stride*sizeof ( double ) );
    memset ( work_mem+stride,0,stride*sizeof ( double ) );

    //my *lower* fields
    double* lower = &work_mem[0];
    //global *lower* fields
    double* gl_lower = &work_mem[stride];

    // cost_of_opening_x is accumulated by every thread, so it must be a
    // reduction variable; without the clause the += below is a data race.
    #pragma omp parallel for reduction(+:cost_of_opening_x)
    for ( i = 0; i < points->num; i++ ) {
        float x_cost = dist ( points->p[i], points->p[x], points->dim ) * points->p[i].weight;
        float current_cost = points->p[i].cost;

        if ( x_cost < current_cost ) {

            // point i would save cost just by switching to x
            // (note that i cannot be a median,
            // or else dist(p[i], p[x]) would be 0)

            switch_membership[i] = 1;
            cost_of_opening_x += x_cost - current_cost;

        } else {

            // cost of assigning i to x is at least current assignment cost of i

            // consider the savings that i's **current** median would realize
            // if we reassigned that median and all its members to x;
            // note we've already accounted for the fact that the median
            // would save z by closing; now we have to subtract from the savings
            // the extra cost of reassigning that median and its members
            int assign = points->p[i].assign;
            // several points share the same median, so different threads can
            // hit the same slot of lower[]; the update has to be atomic
            #pragma omp atomic
            lower[center_table[assign]] += current_cost - x_cost;
        }
    }

    // at this time, we can calculate the cost of opening a center
    // at x; if it is negative, we'll go through with opening it

    for ( int i = 0; i < points->num; i++ ) {
        if ( is_center[i] ) {
            double low = z + work_mem[center_table[i]];
            gl_lower[center_table[i]] = low;
            if ( low > 0 ) {
                // i is a median, and
                // if we were to open x (which we still may not) we'd close i

                // note, we'll ignore the following quantity unless we do open x
                ++number_of_centers_to_close;
                cost_of_opening_x -= low;
            }
        }
    }
    //use the rest of working memory to store the following
    work_mem[K] = number_of_centers_to_close;
    work_mem[K+1] = cost_of_opening_x;

    gl_number_of_centers_to_close = ( int ) work_mem[K];
    gl_cost_of_opening_x = z + work_mem[K+1];

    // Now, check whether opening x would save cost; if so, do it, and
    // otherwise do nothing

    if ( gl_cost_of_opening_x < 0 ) {
        // we'd save money by opening x; we'll do it
        for ( int i = 0; i < points->num; i++ ) {
            bool close_center = gl_lower[center_table[points->p[i].assign]] > 0 ;
            if ( switch_membership[i] || close_center ) {
                // Either i's median (which may be i itself) is closing,
                // or i is closer to x than to its current median
                points->p[i].cost = points->p[i].weight * dist ( points->p[i], points->p[x], points->dim );
                points->p[i].assign = x;
            }
        }
        for ( int i = 0; i < points->num; i++ ) {
            if ( is_center[i] && gl_lower[center_table[i]] > 0 ) {
                is_center[i] = false;
            }
        }
        if ( x >= 0 && x < points->num ) {
            is_center[x] = true;
        }

        *numcenters = *numcenters + 1 - gl_number_of_centers_to_close;
    } else {
        gl_cost_of_opening_x = 0; // the value we'll return
    }

    free ( work_mem );

    return -gl_cost_of_opening_x;
}

我试图并行化的函数:

/* compute Euclidean distance squared between two points */
/* Returns the sum over the first `dim` coordinates of the squared
 * difference between p1.coord[i] and p2.coord[i]. */
float dist ( Point p1, Point p2, int dim )
{
    float result=0.0;
    /* A parallel region is started on every call; each thread's partial
     * sum is combined into result by the reduction clause. */
    #pragma omp parallel for reduction(+:result)
    for (int i=0; i<dim; i++) {
        result += ( p1.coord[i] - p2.coord[i] ) * ( p1.coord[i] - p2.coord[i] );
    }
    return ( result );
}

Point 结构体的定义如下:

/* this structure represents a point */
/* these will be passed around to avoid copying coordinates */
typedef struct {
    float weight;
    float *coord; /* pointer to the coordinate values (not copied with the struct) */
    long assign; /* number of point where this one is assigned */
    float cost; /* cost of that assignment, weight*distance */
} Point;

我有一个大型的 streamcluster 应用程序(815 行代码)，它生成实时数字并以特定方式对它们进行排序。我在 Linux 上使用了 scalasca 工具来测量哪些方法占用了大部分时间，结果发现上面列出的 dist 方法最耗时。我正在尝试使用 OpenMP，但并行代码的运行时间比串行代码更长：串行代码运行约 1.5 秒，而并行化后需要约 20 秒，不过两者的结果相同。我想知道是由于某种原因这部分代码无法并行化，还是我的做法不正确。我试图并行化的方法在调用树中的位置是: main->pkmedian->pFL->pgain->dist(-> 表示调用后面的方法)

最佳答案

您选择并行化的代码:

float result=0.0;
#pragma omp parallel for reduction(+:result)
for (int i=0; i
    result += ( p1.coord[i] - p2.coord[i] ) * ( p1.coord[i] - p2.coord[i] );
}

不太适合从并行化中获益。您不应在此处使用 parallel for，一般也不应该对内层循环做并行化。如果您能并行化某个外层循环，才更有可能看到收益。

协调线程组启动并行区域会产生开销，之后执行缩减也会产生开销。同时，并行区域的内容基本上不需要运行时间。鉴于此，您需要将 dim 设置得非常大，然后才能期望这会带来性能优势。

为了更形象地说明这一点，请考虑：您所做的数学运算只需要纳秒级的时间，可以将其与展示各种 OpenMP 指令开销的图表进行对比。

如果您需要它运行得更快，您的第一站应该是使用适当的编译标志，然后查看 SIMD 操作:SSE 和 AVX 是很好的关键字。您的编译器甚至可能会自动调用它们。

我构建了一些测试代码(见下文)，在启用下列各种优化的情况下分别编译，并在包含 100,000 个元素的数组上运行。请注意，启用 -O3 后的运行时间与 OpenMP 指令本身的开销处于同一数量级。这意味着数组大小需要达到约 400,000 个元素才值得考虑使用 OpenMP，为了保险起见，可能要接近 1,000,000 个。

没有优化。运行时间约为 1900 微秒。
-O3:启用许多优化。运行时间约为 200 微秒。
-ffast-math:你想要这个，除非你正在做一些非常棘手的事情。运行时间大致相同。
-march=native:编译代码以使用 CPU 的全部功能，而不是可以在许多 CPU 上运行的通用指令集。运行时间约为 100 微秒。

我们开始吧，战略性地使用编译器选项 (-march=native) 可以使相关代码的速度加倍，而无需处理并行问题。

这里有一份方便的幻灯片演示文稿，其中包含一些关于如何以高性能方式使用 OpenMP 的提示。

测试代码:

// Micro-benchmark: times a serial sum of squared differences over two
// 100,000-element vectors and prints the elapsed time in microseconds.
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <vector>

int main(){
  std::vector<double> a;
  std::vector<double> b;
  // Fill both vectors with pseudo-random values in [0, 1].
  for(int i=0;i<100000;i++){
    a.push_back(rand()/(double)RAND_MAX);
    b.push_back(rand()/(double)RAND_MAX);
  }

  std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();

  float result = 0.0;
  // Uncomment to compare against the OpenMP version of the same loop.
  //#pragma omp parallel for reduction(+:result)
  for (unsigned int i=0; i<a.size(); i++)
    result += ( a[i] - b[i] ) * ( a[i] - b[i] );

  std::chrono::steady_clock::time_point end= std::chrono::steady_clock::now();

  std::cout << "Time difference = " << std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() << " microseconds"<<std::endl;
}

关于c++ - OpenMp 并行，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/44097890/

Đề xuất bài viết: c++ - 具有匿名结构的不同位映射

Đề xuất bài viết: c++ - 扫描位数组以获取多位模式

Đề xuất bài viết: c++ - 基于 ZeroMQ Session 的请求调度或条件路由

Đề xuất bài viết: C++ 对象内存管理

openmp - OpenMP 中的高斯消除
OpenMP 中的高斯消除。我是 openmp 的新手，想知道我是否在正确的地方使用了我的编译指示和屏障。我的 x 值每次都不同。他们应该是一样的吗？？ #include int num; doub
openmp - OpenMP 和矢量化之间的比较
给定一个示例函数(示例在下面给出)，for 循环可以使用 OpenMP 并行化或使用矢量化进行矢量化(假设编译器执行矢量化)。示例 void function(float* a, float* b,
openmp - OpenMP 中的原子性和关键性有什么区别？
OpenMP 中原子和关键之间有什么区别？我能做到 #pragma omp atomic g_qCount++; 但这和不一样吗 #pragma omp critical g_qCount++; ？
openmp - 给定依赖图生成 OpenMP 代码
我有一个关于如何在您考虑特定依赖关系图时生成 OpenMP 伪代码的问题。所以假设我们有这个特定的图表: 解决方案可能是这样的: #pragma omp parallel {
openmp - 使用 OpenMP 进行缩减
我正在尝试使用 openmp 计算二维矩阵的平均值。这个二维矩阵实际上是一个图像。我正在对数据进行线程分割。例如，如果我有 N线程比我处理行/N thread0 的行数，等等。我的问题是:我可以
openmp - 如何统计测量程序中的 OpenMP 性能？
我想统计测量与 OpenMP 并行化的程序的性能。我选择在执行并行算法的测试应用程序中编写循环 MAX_EXPERIMENTS次并将时间测量报告到文件中。问题解决方案似乎比提取外部循环上方的并行编译
openmp - OpenMP 中的循环顺序折叠性能建议
我找到了 Intel's performance suggestion on Xeon Phi关于 OpenMP 中的 Collapse 子句。 #pragma omp parallel for co
openmp - 如何使用 OpenMP 并行化数组移位？
如何使用 OpenMP 并行化数组移位？我尝试了一些方法，但在以下示例中没有得到任何准确的结果(该示例旋转 Carteira 对象数组的元素，用于排列算法): void rotaciona(int
openmp - 使用 openmp 并行执行函数
我有一系列对几个独立函数的调用。 func1(arg); func2(arg); func3(arg); 我想并行执行它们，而不是串行执行它们。我目前正在使用 #pragma omp parallel
openmp - openmp 中的 Dependent 子句不尊重声明的依赖
我正在尝试使用 openmp 任务来安排基本 jacobi2d 计算的平铺执行。在 jacobi2d 中，依赖于 A(i,j) 从 A(i, j) A(i-1, j) A(i+1, j) A(i, j
openmp - 在 OpenMP 中，如何让每个内核运行一个线程？
我在 3 天前开始使用 OpenMP。我想知道如何使用#pragma使每个内核运行一个线程。详细信息:- int ncores = omp_get_num_procs();for(i = 0; i <
openmp - OpenMP 中的 Schedule 子句
我有一段代码(它是应用程序的一部分)，我正在尝试使用 OpenMP 对其进行优化，正在尝试各种调度策略。就我而言，我注意到 schedule(RUNTIME)条款比其他条款有优势(我没有指定 chun
openmp - OpenMP 或 MPI 哪个更容易学习和调试？
我有一个数字运算 C/C++ 应用程序。它基本上是不同数据集的主循环。我们可以使用 openmp 和 mpi 访问一个 100 节点的集群。我想加速应用程序，但我是 mpi 和 openmp 的绝对新
openmp - OpenMP 分发中的 SECTIONS 指令如何工作？
在 OpenMP 中使用ompsections时，线程会被分配到sections内的 block ，还是每个线程会被分配到每个section？当nthreads == 3时: #pragma omp
openmp - cython openmp 单，屏障
我正在尝试在 cython 中使用 openmp。我需要在 cython 中做两件事: i) 在我的 cython 代码中使用 #pragma omp single{} 作用域。 ii) 使用#pra
openmp - 为什么 OpenMP 不能在 for 循环内有部分？
我正在尝试通过将循环的每次迭代作为 OpenMP 部分来并行化 OpenMP 中基于范围的 for 循环。我想这样做: #pragma omp parallel sections { for ( au
openmp - cython openmp 单，屏障
我正在尝试在 cython 中使用 openmp。我需要在 cython 中做两件事: i) 在我的 cython 代码中使用 #pragma omp single{} 作用域。 ii) 使用#pra
openmp - 将并行程序转换为集群程序。从 OpenMP 到？
我想编写一个代码转换器，它采用基于 OpenMP 的并行程序并在集群上运行它。我该如何解决这个问题？我使用哪些库？如何为此设置小型集群？我发现很难在 Internet 上找到有关集群计算的好 Ma
c++ - OpenMP - OpenMP 'for' 语句中的索引变量必须具有带符号的整数类型
我是 OpenMP 的新手。我正在尝试为 for 循环使用多个内核，但出现此编译错误: “错误 C3016:'x':OpenMP 'for' 语句中的索引变量必须具有带符号的整数类型”。我知道 Op
openmp - 使用 Qt creator 时如何开启 OpenMP
如果我使用 VS 2010 编译器从 Qt Creator 构建项目，我如何启用 OpenMP(从 Visual Studio 构建时，您只需启用该功能)谢谢最佳答案在 .pro 文件中尝试下一步

搜寻专家

Hồ sơ

Tôi là một lập trình viên xuất sắc, rất giỏi!

Bài viết phổ biến của tác giả

Nhận phiếu giảm giá taxi Didi miễn phí

Các bài viết phổ biến trên toàn bộ trang web

trang đầu

đã học

6Ren AI

Trung tâm mua sắm

c++ - Song song OpenMp