# c++ - 為什麼排序後的分組的分組求和比未排序的分組要慢?

## performance (1)

``````#include <cstdlib>
#include <iostream>
#include <ctime>

int main(int argc, char* argv[]) {
int num_values = atoi(argv[1]);
int num_groups = atoi(argv[2]);

int group_size = num_values / num_groups;
int group = -1;

std::srand(42);

for (int i = 0; i < num_values; ++i) {
if (i % group_size == 0) {
++group;
}
std::cout << std::rand() << '\t' << group << '\n';
}

return 0;
}``````

``````#include <iostream>
#include <chrono>
#include <vector>

// This is the function whose performance I am interested in
void grouped_sum(int* p_x, int *p_g, int n, int* p_out) {
for (size_t i = 0; i < n; ++i) {
p_out[p_g[i]] += p_x[i];
}
}

int main() {
std::vector<int> values;
std::vector<int> groups;
std::vector<int> sums;

int n_groups = 0;

// Read in the values and calculate the max number of groups
while(std::cin) {
int value, group;
std::cin >> value >> group;
values.push_back(value);
groups.push_back(group);
if (group > n_groups) {
n_groups = group;
}
}
sums.resize(n_groups);

// Time grouped sums
std::chrono::system_clock::time_point start = std::chrono::system_clock::now();
for (int i = 0; i < 10; ++i) {
grouped_sum(values.data(), groups.data(), values.size(), sums.data());
}
std::chrono::system_clock::time_point end = std::chrono::system_clock::now();

std::cout << (end - start).count() << std::endl;

return 0;
}``````

``````g++ -O3 generate_groups.cc -o generate_groups
g++ -O3 sum_groups.cc -o sum_groups
generate_groups 1000000 100 > groups
shuf groups > groups2
sum_groups < groups
sum_groups < groups2
sum_groups < groups2
sum_groups < groups
20784
8854
8220
21006``````

# 設置/使其變慢

``````sumspeed\$ time ./sum_groups < groups_shuffled
11558358

real    0m0.705s
user    0m0.692s
sys 0m0.013s

sumspeed\$ time ./sum_groups < groups_sorted
24986825

real    0m0.722s
user    0m0.711s
sys 0m0.012s``````

``````sumspeed\$ time ./sum_groups < groups_shuffled
1131838420

real    0m1.828s
user    0m1.811s
sys 0m0.016s

sumspeed\$ time ./sum_groups < groups_sorted
2494032110

real    0m3.189s
user    0m3.169s
sys 0m0.016s``````

# 性能差異

``````sumspeed\$ perf record ./sum_groups < groups_shuffled
1166805982
[ perf record: Woken up 1 times to write data ]
Warning:
Processed 4636 samples and lost 6.95% samples!

[ perf record: Captured and wrote 0.176 MB perf.data (4314 samples) ]

sumspeed\$ perf record ./sum_groups < groups_sorted
2571547832
[ perf record: Woken up 2 times to write data ]
[ perf record: Captured and wrote 0.420 MB perf.data (10775 samples) ]``````

``````sumspeed\$ perf diff
[...]
# Event 'cycles:uppp'
#
# Baseline  Delta Abs  Shared Object        Symbol
# ........  .........  ...................  ........................................................................
#
57.99%    +26.33%  sum_groups           [.] main
12.10%     -7.41%  libc-2.23.so         [.] _IO_getc
9.82%     -6.40%  libstdc++.so.6.0.21  [.] std::num_get<char, std::istreambuf_iterator<char, std::char_traits<c
6.45%     -4.00%  libc-2.23.so         [.] _IO_ungetc
2.40%     -1.32%  libc-2.23.so         [.] _IO_sputbackc
1.65%     -1.21%  libstdc++.so.6.0.21  [.] 0x00000000000dc4a4
1.57%     -1.20%  libc-2.23.so         [.] _IO_fflush
1.71%     -1.07%  libstdc++.so.6.0.21  [.] std::istream::sentry::sentry
1.22%     -0.77%  libstdc++.so.6.0.21  [.] std::istream::operator>>
0.79%     -0.47%  libstdc++.so.6.0.21  [.] __gnu_cxx::stdio_sync_filebuf<char, std::char_traits<char> >::uflow
[...]``````

`main()` 中可能有更多時間，這可能已內聯了 `grouped_sum()` 。 太好了，非常感謝。

# 性能註釋

`main()` 內部 花費的時間是否有所不同？

``````sumspeed\$ perf annotate -i perf.data.old
[...]
│     // This is the function whose performance I am interested in
│     void grouped_sum(int* p_x, int *p_g, int n, int* p_out) {
│       for (size_t i = 0; i < n; ++i) {
│180:   xor    %eax,%eax
│       test   %rdi,%rdi
│     ↓ je     1a4
│       nop
│         p_out[p_g[i]] += p_x[i];
6,88 │190:   movslq (%r9,%rax,4),%rdx
58,54 │       mov    (%r8,%rax,4),%esi
│     #include <chrono>
│     #include <vector>
│
│     // This is the function whose performance I am interested in
│     void grouped_sum(int* p_x, int *p_g, int n, int* p_out) {
│       for (size_t i = 0; i < n; ++i) {
│         p_out[p_g[i]] += p_x[i];
[...]``````

``````sumspeed\$ perf annotate -i perf.data
[...]
│     // This is the function whose performance I am interested in
│     void grouped_sum(int* p_x, int *p_g, int n, int* p_out) {
│       for (size_t i = 0; i < n; ++i) {
│180:   xor    %eax,%eax
│       test   %rdi,%rdi
│     ↓ je     1a4
│       nop
│         p_out[p_g[i]] += p_x[i];
1,00 │190:   movslq (%r9,%rax,4),%rdx
55,12 │       mov    (%r8,%rax,4),%esi
│     #include <chrono>
│     #include <vector>
│
│     // This is the function whose performance I am interested in
│     void grouped_sum(int* p_x, int *p_g, int n, int* p_out) {
│       for (size_t i = 0; i < n; ++i) {
│         p_out[p_g[i]] += p_x[i];
[...]``````

# 性能統計

``````sumspeed\$ perf stat ./sum_groups < groups_shuffled
1138880176

Performance counter stats for './sum_groups':

1826,232278      task-clock (msec)         #    0,999 CPUs utilized
72      context-switches          #    0,039 K/sec
1      cpu-migrations            #    0,001 K/sec
4 076      page-faults               #    0,002 M/sec
5 403 949 695      cycles                    #    2,959 GHz
930 473 671      stalled-cycles-frontend   #   17,22% frontend cycles idle
9 827 685 690      instructions              #    1,82  insn per cycle
#    0,09  stalled cycles per insn
2 086 725 079      branches                  # 1142,639 M/sec
2 069 655      branch-misses             #    0,10% of all branches

1,828334373 seconds time elapsed

sumspeed\$ perf stat ./sum_groups < groups_sorted
2496546045

Performance counter stats for './sum_groups':

3186,100661      task-clock (msec)         #    1,000 CPUs utilized
5      context-switches          #    0,002 K/sec
0      cpu-migrations            #    0,000 K/sec
4 079      page-faults               #    0,001 M/sec
9 424 565 623      cycles                    #    2,958 GHz
4 955 937 177      stalled-cycles-frontend   #   52,59% frontend cycles idle
9 829 009 511      instructions              #    1,04  insn per cycle
#    0,50  stalled cycles per insn
2 086 942 109      branches                  #  655,014 M/sec
2 078 204      branch-misses             #    0,10% of all branches

3,186768174 seconds time elapsed``````

# 修復它

## 多個和向量

（代碼不是很漂亮；不要判斷我，互聯網！）

``````#include <iostream>
#include <chrono>
#include <vector>

#ifndef NSUMS
#define NSUMS (4) // must be power of 2 (for masking to work)
#endif

// This is the function whose performance I am interested in
void grouped_sum(int* p_x, int *p_g, int n, int** p_out) {
for (size_t i = 0; i < n; ++i) {
p_out[i & (NSUMS-1)][p_g[i]] += p_x[i];
}
}

int main() {
std::vector<int> values;
std::vector<int> groups;
std::vector<int> sums[NSUMS];

int n_groups = 0;

// Read in the values and calculate the max number of groups
while(std::cin) {
int value, group;
std::cin >> value >> group;
values.push_back(value);
groups.push_back(group);
if (group >= n_groups) {
n_groups = group+1;
}
}
for (int i=0; i<NSUMS; ++i) {
sums[i].resize(n_groups);
}

// Time grouped sums
std::chrono::system_clock::time_point start = std::chrono::system_clock::now();
int* sumdata[NSUMS];
for (int i = 0; i < NSUMS; ++i) {
sumdata[i] = sums[i].data();
}
for (int i = 0; i < 1000; ++i) {
grouped_sum(values.data(), groups.data(), values.size(), sumdata);
}
for (int i = 1; i < NSUMS; ++i) {
for (int j = 0; j < n_groups; ++j) {
sumdata[0][j] += sumdata[i][j];
}
}
std::chrono::system_clock::time_point end = std::chrono::system_clock::now();

std::cout << (end - start).count() << " with NSUMS=" << NSUMS << std::endl;

return 0;
}``````

（哦，我還修復了n_groups的計算；它被減一了。）

### 結果

``````sumspeed\$ for n in 1 2 4 8 128; do make -s clean && make -s NSUMS=\$n && (perf stat ./sum_groups < groups_shuffled && perf stat ./sum_groups < groups_sorted)  2>&1 | egrep '^[0-9]|frontend'; done
1134557008 with NSUMS=1
924 611 882      stalled-cycles-frontend   #   17,13% frontend cycles idle
2513696351 with NSUMS=1
4 998 203 130      stalled-cycles-frontend   #   52,79% frontend cycles idle
1116188582 with NSUMS=2
899 339 154      stalled-cycles-frontend   #   16,83% frontend cycles idle
1365673326 with NSUMS=2
1 845 914 269      stalled-cycles-frontend   #   29,97% frontend cycles idle
1127172852 with NSUMS=4
902 964 410      stalled-cycles-frontend   #   16,79% frontend cycles idle
1171849032 with NSUMS=4
1 007 807 580      stalled-cycles-frontend   #   18,29% frontend cycles idle
1118732934 with NSUMS=8
881 371 176      stalled-cycles-frontend   #   16,46% frontend cycles idle
1129842892 with NSUMS=8
905 473 182      stalled-cycles-frontend   #   16,80% frontend cycles idle
1497803734 with NSUMS=128
1 982 652 954      stalled-cycles-frontend   #   30,63% frontend cycles idle
1180742299 with NSUMS=128
1 075 507 514      stalled-cycles-frontend   #   19,39% frontend cycles idle   ``````

## 寄存器中的每組總和

（這是在編輯中添加的）

``````// This is the function whose performance I am interested in
void grouped_sum(int* p_x, int *p_g, int n, int* p_out) {
int i = n-1;
while (i >= 0) {
int g = p_g[i];
int gsum = 0;
do {
gsum += p_x[i--];
} while (i >= 0 && p_g[i] == g);
p_out[g] += gsum;
}
}``````

### 結果

``````sumspeed\$ time ./sum_groups < groups_shuffled
2236354315

real    0m2.932s
user    0m2.923s
sys 0m0.009s``````

...但是比排序輸入的“許多總和”解決方案快40％。

``````sumspeed\$ time ./sum_groups < groups_sorted
809694018

real    0m1.501s
user    0m1.496s
sys 0m0.005s``````

## 多個和向量，具有偏移量而不是位掩碼

Sopel 建議了四個展開的擴展，以替代我的位掩碼方法。 我已經實施了他們建議的通用版本，可以處理不同的 `NSUMS` 。 我指望編譯器為我們展開內部循環（至少在 `NSUMS=4` 確實 `NSUMS=4` ）。

``````#include <iostream>
#include <chrono>
#include <vector>

#ifndef NSUMS
#define NSUMS (4) // must be power of 2 (for masking to work)
#endif

#ifndef INNER
#define INNER (0)
#endif
#if INNER
// This is the function whose performance I am interested in
void grouped_sum(int* p_x, int *p_g, int n, int** p_out) {
size_t i = 0;
int quadend = n & ~(NSUMS-1);
for (; i < quadend; i += NSUMS) {
for (int k=0; k<NSUMS; ++k) {
p_out[k][p_g[i+k]] += p_x[i+k];
}
}
for (; i < n; ++i) {
p_out[0][p_g[i]] += p_x[i];
}
}
#else
// This is the function whose performance I am interested in
void grouped_sum(int* p_x, int *p_g, int n, int** p_out) {
for (size_t i = 0; i < n; ++i) {
p_out[i & (NSUMS-1)][p_g[i]] += p_x[i];
}
}
#endif

int main() {
std::vector<int> values;
std::vector<int> groups;
std::vector<int> sums[NSUMS];

int n_groups = 0;

// Read in the values and calculate the max number of groups
while(std::cin) {
int value, group;
std::cin >> value >> group;
values.push_back(value);
groups.push_back(group);
if (group >= n_groups) {
n_groups = group+1;
}
}
for (int i=0; i<NSUMS; ++i) {
sums[i].resize(n_groups);
}

// Time grouped sums
std::chrono::system_clock::time_point start = std::chrono::system_clock::now();
int* sumdata[NSUMS];
for (int i = 0; i < NSUMS; ++i) {
sumdata[i] = sums[i].data();
}
for (int i = 0; i < 1000; ++i) {
grouped_sum(values.data(), groups.data(), values.size(), sumdata);
}
for (int i = 1; i < NSUMS; ++i) {
for (int j = 0; j < n_groups; ++j) {
sumdata[0][j] += sumdata[i][j];
}
}
std::chrono::system_clock::time_point end = std::chrono::system_clock::now();

std::cout << (end - start).count() << " with NSUMS=" << NSUMS << ", INNER=" << INNER << std::endl;

return 0;
}``````

### 結果

``````sumspeed\$ for n in 2 4 8 16; do for inner in 0 1; do make -s clean && make -s NSUMS=\$n INNER=\$inner && (perf stat ./sum_groups < groups_shuffled && perf stat ./sum_groups < groups_sorted)  2>&1 | egrep '^[0-9]|frontend'; done; done1130558787 with NSUMS=2, INNER=0
915 158 411      stalled-cycles-frontend   #   16,96% frontend cycles idle
1351420957 with NSUMS=2, INNER=0
1 589 408 901      stalled-cycles-frontend   #   26,21% frontend cycles idle
840071512 with NSUMS=2, INNER=1
1 053 982 259      stalled-cycles-frontend   #   23,26% frontend cycles idle
1391591981 with NSUMS=2, INNER=1
2 830 348 854      stalled-cycles-frontend   #   45,35% frontend cycles idle
1110302654 with NSUMS=4, INNER=0
890 869 892      stalled-cycles-frontend   #   16,68% frontend cycles idle
1145175062 with NSUMS=4, INNER=0
948 879 882      stalled-cycles-frontend   #   17,40% frontend cycles idle
822954895 with NSUMS=4, INNER=1
1 253 110 503      stalled-cycles-frontend   #   28,01% frontend cycles idle
929548505 with NSUMS=4, INNER=1
1 422 753 793      stalled-cycles-frontend   #   30,32% frontend cycles idle
1128735412 with NSUMS=8, INNER=0
921 158 397      stalled-cycles-frontend   #   17,13% frontend cycles idle
1120606464 with NSUMS=8, INNER=0
891 960 711      stalled-cycles-frontend   #   16,59% frontend cycles idle
800789776 with NSUMS=8, INNER=1
1 204 516 303      stalled-cycles-frontend   #   27,25% frontend cycles idle
805223528 with NSUMS=8, INNER=1
1 222 383 317      stalled-cycles-frontend   #   27,52% frontend cycles idle
1121644613 with NSUMS=16, INNER=0
886 781 824      stalled-cycles-frontend   #   16,54% frontend cycles idle
1108977946 with NSUMS=16, INNER=0
860 600 975      stalled-cycles-frontend   #   16,13% frontend cycles idle
911365998 with NSUMS=16, INNER=1
1 494 671 476      stalled-cycles-frontend   #   31,54% frontend cycles idle
898729229 with NSUMS=16, INNER=1
1 474 745 548      stalled-cycles-frontend   #   31,24% frontend cycles idle   ``````