# java - 为什么处理排序数组比处理未排序数组更快?

## c++ performance (14)

#include <algorithm>
#include <ctime>
#include <iostream>

int main()
{
// Generate data
const unsigned arraySize = 32768;
int data[arraySize];

for (unsigned c = 0; c < arraySize; ++c)
data[c] = std::rand() % 256;

// !!! With this, the next loop runs faster
std::sort(data, data + arraySize);

// Test
clock_t start = clock();
long long sum = 0;

for (unsigned i = 0; i < 100000; ++i)
{
// Primary loop
for (unsigned c = 0; c < arraySize; ++c)
{
if (data[c] >= 128)
sum += data[c];
}
}

double elapsedTime = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;

std::cout << elapsedTime << std::endl;
std::cout << "sum = " << sum << std::endl;
}
• 没有std::sort(data, data + arraySize); ，代码运行时间为11.54秒。
• 使用排序数据，代码运行1.93秒。

import java.util.Arrays;
import java.util.Random;

public class Main
{
public static void main(String[] args)
{
// Generate data
int arraySize = 32768;
int data[] = new int[arraySize];

Random rnd = new Random(0);
for (int c = 0; c < arraySize; ++c)
data[c] = rnd.nextInt() % 256;

// !!! With this, the next loop runs faster
Arrays.sort(data);

// Test
long start = System.nanoTime();
long sum = 0;

for (int i = 0; i < 100000; ++i)
{
// Primary loop
for (int c = 0; c < arraySize; ++c)
{
if (data[c] >= 128)
sum += data[c];
}
}

System.out.println((System.nanoTime() - start) / 1000000000.0);
System.out.println("sum = " + sum);
}
}

• 到底是怎么回事？
• 为什么处理排序数组比处理未排序数组更快？
• 代码总结了一些独立的术语，顺序无关紧要。

if (expression)
{
// Run 1
} else {
// Run 2
}

### 可视化：

O      Route 1  /-------------------------------
/|\             /
|  ---------##/
/ \            \
\
Route 2  \--------------------------------

if (likely( everything_is_ok ))
{
/* Do something */
}

if (unlikely(very_improbable_condition))
{
/* Do something */
}

int i= 0, j, k= arraySize;
while (i < k)
{
j= (i + k) >> 1;
if (data[j] >= 128)
k= j;
else
i= j;
}
sum= 0;
for (; i < arraySize; i++)
sum+= data[i];

int i, k, j= (i + k) >> 1;
for (i= 0, k= arraySize; i < k; (data[j] >= 128 ? k : i)= j)
j= (i + k) >> 1;
for (sum= 0; i < arraySize; i++)
sum+= data[i];

1. 英特尔 - 避免分支机构错误预测的成本
2. 英特尔 - 分支和循环重组以防止错误预测
3. 科学论文 - 分支预测计算机结构
4. 书籍：JL Hennessy，DA Patterson：计算机体系结构：定量方法
5. 科学出版物中的文章：TY Yeh，YN Patt在分支预测中做了很多这些。

data[c] = std::rand() % 256;

if (data[c] >= 128)
sum += data[c];

CC++ ，这个语句可以直接编译（没有任何优化）到x86的条件移动指令，是三元运算符... ? ... : ... ... ? ... : ... 所以我们将上面的语句重写为等价的语句：

sum += data[c] >=128 ? data[c] : 0;

86

//  Branch - Random
seconds = 8.885

//  Branch - Sorted
seconds = 1.528

//  Branchless - Random
seconds = 3.716

//  Branchless - Sorted
seconds = 3.71

64位

//  Branch - Random
seconds = 11.302

//  Branch - Sorted
seconds = 1.830

//  Branchless - Random
seconds = 2.736

//  Branchless - Sorted
seconds = 2.737

max1使用条件分支if... else ...

int max1(int a, int b) {
if (a > b)
return a;
else
return b;
}

max2使用三元运算符... ? ... : ... ... ? ... : ...

int max2(int a, int b) {
return a > b ? a : b;
}

:max1
movl    %edi, -4(%rbp)
movl    %esi, -8(%rbp)
movl    -4(%rbp), %eax
cmpl    -8(%rbp), %eax
jle     .L2
movl    -4(%rbp), %eax
movl    %eax, -12(%rbp)
jmp     .L4
.L2:
movl    -8(%rbp), %eax
movl    %eax, -12(%rbp)
.L4:
movl    -12(%rbp), %eax
leave
ret

:max2
movl    %edi, -4(%rbp)
movl    %esi, -8(%rbp)
movl    -4(%rbp), %eax
cmpl    %eax, -8(%rbp)
cmovge  -8(%rbp), %eax
leave
ret

1. 它是一个小桌子，可能会被缓存在处理器中
2. 您正在以非常紧凑的循环运行和/或处理器可以预加载数据

Pfew，那到底是什么意思呢？

for (int i=0; i<array.Length; ++i)
// Use array[i]

// Generate data
int arraySize = 32768;
int[] data = new int[arraySize];

Random rnd = new Random(0);
for (int c = 0; c < arraySize; ++c)
data[c] = rnd.Next(256);

//To keep the spirit of the code in-tact I'll make a separate lookup table
// (I assume we cannot modify 'data' or the number of loops)
int[] lookup = new int[256];

for (int c = 0; c < 256; ++c)
lookup[c] = (c >= 128) ? c : 0;

// Test
DateTime startTime = System.DateTime.Now;
long sum = 0;

for (int i = 0; i < 100000; ++i)
{
// Primary loop
for (int j = 0; j < arraySize; ++j)
{
// Here you basically want to use simple operations - so no
// random branches, but things like &, |, *, -, +, etc. are fine.
sum += lookup[data[j]];
}
}

DateTime endTime = System.DateTime.Now;
Console.WriteLine(endTime - startTime);
Console.WriteLine("sum = " + sum);

==32551== Branches:        656,645,130  (  656,609,208 cond +    35,922 ind)
==32551== Mispredicts:         169,556  (      169,095 cond +       461 ind)
==32551== Mispred rate:            0.0% (          0.0%     +       1.2%   )

==32555== Branches:        655,996,082  (  655,960,160 cond +  35,922 ind)
==32555== Mispredicts:     164,073,152  (  164,072,692 cond +     460 ind)
==32555== Mispred rate:           25.0% (         25.0%     +     1.2%   )

Bc    Bcm Bi Bim
10,001      4  0   0      for (unsigned i = 0; i < 10000; ++i)
.      .  .   .      {
.      .  .   .          // primary loop
327,690,000 10,016  0   0          for (unsigned c = 0; c < arraySize; ++c)
.      .  .   .          {
327,680,000 10,006  0   0              if (data[c] >= 128)
0      0  0   0                  sum += data[c];
.      .  .   .          }
.      .  .   .      }

Bc         Bcm Bi Bim
10,001           4  0   0      for (unsigned i = 0; i < 10000; ++i)
.           .  .   .      {
.           .  .   .          // primary loop
327,690,000      10,038  0   0          for (unsigned c = 0; c < arraySize; ++c)
.           .  .   .          {
327,680,000 164,050,007  0   0              if (data[c] >= 128)
0           0  0   0                  sum += data[c];
.           .  .   .          }
.           .  .   .      }

perf stat ./sumtest_sorted

Performance counter stats for './sumtest_sorted':

11808.095776 task-clock                #    0.998 CPUs utilized
1,062 context-switches          #    0.090 K/sec
14 CPU-migrations            #    0.001 K/sec
337 page-faults               #    0.029 K/sec
26,487,882,764 cycles                    #    2.243 GHz
41,025,654,322 instructions              #    1.55  insns per cycle
6,558,871,379 branches                  #  555.455 M/sec
567,204 branch-misses             #    0.01% of all branches

11.827228330 seconds time elapsed

Performance counter stats for './sumtest_unsorted':

28877.954344 task-clock                #    0.998 CPUs utilized
2,584 context-switches          #    0.089 K/sec
18 CPU-migrations            #    0.001 K/sec
335 page-faults               #    0.012 K/sec
65,076,127,595 cycles                    #    2.253 GHz
41,032,528,741 instructions              #    0.63  insns per cycle
6,560,579,013 branches                  #  227.183 M/sec
1,646,394,749 branch-misses             #   25.10% of all branches

28.935500947 seconds time elapsed

perf record -e branch-misses ./sumtest_unsorted
perf annotate -d sumtest_unsorted
Percent |      Source code & Disassembly of sumtest_unsorted
------------------------------------------------
...
:                      sum += data[c];
0.00 :        400a1a:       mov    -0x14(%rbp),%eax
39.97 :        400a1d:       mov    %eax,%eax
5.31 :        400a1f:       mov    -0x20040(%rbp,%rax,4),%eax
4.60 :        400a26:       cltq
...

____________________________________________________________________________________
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT

______   ________
|     |__|

• 分支预测器是古老的性能改进技术之一，它仍然与现代建筑相关。虽然简单的预测技术提供快速查找和功率效率，但是它们具有高的误预测率。

• 另一方面，复杂的分支预测 - 无论是基于神经的还是两级分支预测的变体 - 提供更好的预测准确性，但它们消耗更多的功率和复杂性以指数方式增加。

• 除此之外，在复杂的预测技术中，预测分支所花费的时间本身非常高，从2到5个周期 - 这与实际分支的执行时间相当。

• 分支预测本质上是一种优化（最小化）问题，其中重点在于以最少的资源实现最低可能的未命中率，低功耗和低复杂度。

// sort backwards (higher values first)
std::sort(data, data + arraySize, std::greater<int>());

for (unsigned c = 0; c < arraySize; ++c) {
if (data[c] < 128) {
break;
}
sum += data[c];
}

MOV R0, #0     // R0 = sum = 0
MOV R1, #0     // R1 = c = 0
ADR R2, data   // R2 = addr of data array (put this instruction outside outer loop)
.inner_loop    // Inner loop branch label
LDRB R3, [R2, R1]     // R3 = data[c]
CMP R3, #128          // compare R3 to 128
ADDGE R0, R0, R3      // if R3 >= 128, then sum += data[c] -- no branch needed!
ADD R1, R1, #1        // c++
CMP R1, #arraySize    // compare c to arraySize
BLT inner_loop        // Branch to inner_loop if c < arraySize

1. 没有分支预测器。

1. 使用分支预测器，不要进行条件跳转。让我们假设预测没有采取条件跳跃。

1. 使用分支预测器并进行条件跳转。让我们假设预测没有采取条件跳跃。