# c++ - swift教程pdf - swift进阶

## 为什么在单独的循环中元素添加比在组合循环中快得多? (7)

OP的代码：

``````const int n=100000;

for(int j=0;j<n;j++){
a1[j] += b1[j];
c1[j] += d1[j];
}``````

``````for(int j=0;j<n;j++){
a1[j] += b1[j];
}
for(int j=0;j<n;j++){
c1[j] += d1[j];
}``````

• 我们将让循环及其迭代成为一个从1开始并以100000结束而不是从循环开始的Summation，因为我们不需要担心内存寻址的0索引方案，因为我们只对它感兴趣算法本身。
• 在这两种情况下，我们有4个函数可以使用，2个函数调用，每个函数调用都有2个操作。因此，我们将设置这些函数和函数调用是`F1()``F2()``f(a)``f(b)``f(c)``f(d)`

``````Sum n=1 : [1,100000] = F1(), F2();
F1() = { f(a) = f(a) + f(b); }
F2() = { f(c) = f(c) + f(d);  }``````

``````Sum1 n=1 : [1,100000] = F1();
F1() = { f(a) = f(a) + f(b); }

Sum2 n=1 : [1,100000] = F1();
F1() = { f(c) = f(c) + f(d); }``````

``````const n = 100000
distTraveledOfFirst = (100 + 500) + ((n-1)*(500 + 500);
// Simplify
distTraveledOfFirst = 600 + (99999*100);
distTraveledOfFirst = 600 + 9999900;
distTraveledOfFirst =  10000500;
// Distance Traveled On First Algorithm = 10,000,500ft

distTraveledOfSecond = 100 + 500 = 600;
// Distance Traveled On Second Algorithm = 600ft;    ``````

OPs修订问题

``````DeltaTimeDifference approximately = Loop1(time) - Loop2(time)
//where
Loop1(time) = Loop2(time) + (Loop2(time)*[0.6,0.7]) // approximately
// So when we substitute this back into the difference equation we end up with
DeltaTimeDifference approximately = (Loop2(time) + (Loop2(time)*[0.6,0.7])) - Loop2(time)
// And finally we can simplify this to
DeltaTimeDifference approximately = [0.6,0.7]*(Loop2(time)``````

``````const int n = 100000;

for (int j = 0; j < n; j++) {
a1[j] += b1[j];
c1[j] += d1[j];
}``````

``````for (int j = 0; j < n; j++) {
a1[j] += b1[j];
}

for (int j = 0; j < n; j++) {
c1[j] += d1[j];
}``````

PS：我不确定，如果这有帮助：

``````movsd       xmm0,mmword ptr [edx+18h]
movsd       mmword ptr [ecx+20h],xmm0
movsd       xmm0,mmword ptr [esi+10h]
movsd       mmword ptr [eax+30h],xmm0
movsd       xmm0,mmword ptr [edx+20h]
movsd       mmword ptr [ecx+28h],xmm0
movsd       xmm0,mmword ptr [esi+18h]

``````addsd       xmm0,mmword ptr [eax+28h]
movsd       mmword ptr [eax+28h],xmm0
movsd       xmm0,mmword ptr [ecx+20h]
movsd       mmword ptr [eax+30h],xmm0
movsd       xmm0,mmword ptr [ecx+28h]
movsd       mmword ptr [eax+38h],xmm0
movsd       xmm0,mmword ptr [ecx+30h]
movsd       mmword ptr [eax+40h],xmm0``````

PPS：这是完整的代码。 它使用TBB `Tick_Count`来实现更高分辨率的时序，可以通过不定义`TBB_TIMING`宏来禁用它：

``````#include <iostream>
#include <iomanip>
#include <cmath>
#include <string>

//#define TBB_TIMING

#ifdef TBB_TIMING
#include <tbb/tick_count.h>
using tbb::tick_count;
#else
#include <time.h>
#endif

using namespace std;

//#define preallocate_memory new_cont

enum { new_cont, new_sep };

double *a1, *b1, *c1, *d1;

void allo(int cont, int n)
{
switch(cont) {
case new_cont:
a1 = new double[n*4];
b1 = a1 + n;
c1 = b1 + n;
d1 = c1 + n;
break;
case new_sep:
a1 = new double[n];
b1 = new double[n];
c1 = new double[n];
d1 = new double[n];
break;
}

for (int i = 0; i < n; i++) {
a1[i] = 1.0;
d1[i] = 1.0;
c1[i] = 1.0;
b1[i] = 1.0;
}
}

void ff(int cont)
{
switch(cont){
case new_sep:
delete[] b1;
delete[] c1;
delete[] d1;
case new_cont:
delete[] a1;
}
}

double plain(int n, int m, int cont, int loops)
{
#ifndef preallocate_memory
allo(cont,n);
#endif

#ifdef TBB_TIMING
tick_count t0 = tick_count::now();
#else
clock_t start = clock();
#endif

if (loops == 1) {
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++){
a1[j] += b1[j];
c1[j] += d1[j];
}
}
} else {
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
a1[j] += b1[j];
}
for (int j = 0; j < n; j++) {
c1[j] += d1[j];
}
}
}
double ret;

#ifdef TBB_TIMING
tick_count t1 = tick_count::now();
ret = 2.0*double(n)*double(m)/(t1-t0).seconds();
#else
clock_t end = clock();
ret = 2.0*double(n)*double(m)/(double)(end - start) *double(CLOCKS_PER_SEC);
#endif

#ifndef preallocate_memory
ff(cont);
#endif

return ret;
}

void main()
{
freopen("C:\\test.csv", "w", stdout);

char *s = " ";

string na[2] ={"new_cont", "new_sep"};

cout << "n";

for (int j = 0; j < 2; j++)
for (int i = 1; i <= 2; i++)
#ifdef preallocate_memory
cout << s << i << "_loops_" << na[preallocate_memory];
#else
cout << s << i << "_loops_" << na[j];
#endif

cout << endl;

long long nmax = 1000000;

#ifdef preallocate_memory
allo(preallocate_memory, nmax);
#endif

for (long long n = 1L; n < nmax; n = max(n+1, long long(n*1.2)))
{
const long long m = 10000000/n;
cout << n;

for (int j = 0; j < 2; j++)
for (int i = 1; i <= 2; i++)
cout << s << plain(n, m, j, i);
cout << endl;
}
}``````

（它显示不同`n`值的FLOP / s。）

PS：我改变了循环以倒数到零，并且组合方法稍微快一点。 抓我的头。 请注意新的数组大小和循环计数。

``````// MemBufferMystery.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <iostream>
#include <cmath>
#include <string>
#include <time.h>

#define  dbl    double
#define  MAX_ARRAY_SZ    262145    //16777216    // AKA (2^24)
#define  STEP_SZ           1024    //   65536    // AKA (2^16)

int _tmain(int argc, _TCHAR* argv[]) {
long i, j, ArraySz = 0,  LoopKnt = 1024;
time_t start, Cumulative_Combined = 0, Cumulative_Separate = 0;
dbl *a = NULL, *b = NULL, *c = NULL, *d = NULL, *InitToOnes = NULL;

a = (dbl *)calloc( MAX_ARRAY_SZ, sizeof(dbl));
b = (dbl *)calloc( MAX_ARRAY_SZ, sizeof(dbl));
c = (dbl *)calloc( MAX_ARRAY_SZ, sizeof(dbl));
d = (dbl *)calloc( MAX_ARRAY_SZ, sizeof(dbl));
InitToOnes = (dbl *)calloc( MAX_ARRAY_SZ, sizeof(dbl));
// Initialize array to 1.0 second.
for(j = 0; j< MAX_ARRAY_SZ; j++) {
InitToOnes[j] = 1.0;
}

// Increase size of arrays and time
for(ArraySz = STEP_SZ; ArraySz<MAX_ARRAY_SZ; ArraySz += STEP_SZ) {
a = (dbl *)realloc(a, ArraySz * sizeof(dbl));
b = (dbl *)realloc(b, ArraySz * sizeof(dbl));
c = (dbl *)realloc(c, ArraySz * sizeof(dbl));
d = (dbl *)realloc(d, ArraySz * sizeof(dbl));
// Outside the timing loop, initialize
// b and d arrays to 1.0 sec for consistent += performance.
memcpy((void *)b, (void *)InitToOnes, ArraySz * sizeof(dbl));
memcpy((void *)d, (void *)InitToOnes, ArraySz * sizeof(dbl));

start = clock();
for(i = LoopKnt; i; i--) {
for(j = ArraySz; j; j--) {
a[j] += b[j];
c[j] += d[j];
}
}
Cumulative_Combined += (clock()-start);
printf("\n %6i miliseconds for combined array sizes %i and %i loops",
(int)(clock()-start), ArraySz, LoopKnt);
start = clock();
for(i = LoopKnt; i; i--) {
for(j = ArraySz; j; j--) {
a[j] += b[j];
}
for(j = ArraySz; j; j--) {
c[j] += d[j];
}
}
Cumulative_Separate += (clock()-start);
printf("\n %6i miliseconds for separate array sizes %i and %i loops \n",
(int)(clock()-start), ArraySz, LoopKnt);
}
printf("\n Cumulative combined array processing took %10.3f seconds",
(dbl)(Cumulative_Combined/(dbl)CLOCKS_PER_SEC));
printf("\n Cumulative seperate array processing took %10.3f seconds",
(dbl)(Cumulative_Separate/(dbl)CLOCKS_PER_SEC));
getchar();

free(a); free(b); free(c); free(d); free(InitToOnes);
return 0;
}``````

@Mysticial的回答让很多人（包括我）相信，可能是因为它是唯一一个似乎依赖于事实的人，但它只是事实的一个“数据点”。

1. 它在一个和两个循环版本之间存在最大的差异（几乎是三分之一）

2. 这是唯一的一点，其中一个循环（即连续分配）胜过双循环版本。 （这使得Mysticial的答案成为可能。）

## 提案

``````for(int j=0;j<n;j++){
a[j] += b[j];
}
for(int j=0;j<n;j++){
c[j] += d[j];
}``````

``````for(int j=0;j<n;j++){
a[j] += b[j];
c[j] += d[j];
}``````

`n = 2` ，我们正在使用字节。 因此，在我的场景中，我们只有4个字节的RAM ，其余的内存明显变慢（比如说访问时间长100倍）。

• ``````for(int j=0;j<n;j++){
a[j] += b[j];
}
for(int j=0;j<n;j++){
c[j] += d[j];
}``````
• 缓存`a[0]``a[1]`然后`b[0]``b[1]`并在缓存中设置`a[0] = a[0] + b[0]` - 缓存中现在有4个字节， `a[0], a[1]``b[0], b[1]` 。 成本= 100 + 100。

• 在缓存中设置`a[1] = a[1] + b[1]` 。 成本= 1 + 1。
• 重复`c``d`
• 总费用= `(100 + 100 + 1 + 1) * 2 = 404`

• ``````for(int j=0;j<n;j++){
a[j] += b[j];
c[j] += d[j];
}``````
• 缓存`a[0]``a[1]`然后`b[0]``b[1]`并在缓存中设置`a[0] = a[0] + b[0]` - 缓存中现在有4个字节， `a[0], a[1]``b[0], b[1]` 。 成本= 100 + 100。

• 从缓存和缓存`c[0]``c[1]`弹出`a[0], a[1], b[0], b[1]` `c[1]`然后`d[0]``d[1]`并设置`c[0] = c[0] + d[0]`缓存中的`c[0] = c[0] + d[0]` 。 成本= 100 + 100。
• 我怀疑你开始明白我要去哪里了。
• 总费用= `(100 + 100 + 100 + 100) * 2 = 800`