# python - scipy是什么 - scipy是做什么的

## 如何计算所有列之间的相关性,并使用python或pandas删除高度相关的列 (3)

```
GA      PN       PC     MBP      GR     AP
0.033   6.652   6.681   0.194   0.874   3.177
0.034   9.039   6.224   0.194   1.137   3.4
0.035   10.936  10.304  1.015   0.911   4.9
0.022   10.11   9.603   1.374   0.848   4.566
0.035   2.963   17.156  0.599   0.823   9.406
0.033   10.872  10.244  1.015   0.574   4.871
0.035   21.694  22.389  1.015   0.859   9.259
0.035   10.936  10.304  1.015   0.911   4.5
```

def corr_df(x, corr_val):
    """Drop features that are strongly correlated with an earlier feature.

    Every column of the correlation matrix of ``x`` is compared with all
    columns to its left.  A column whose absolute correlation with any
    earlier column is at least ``corr_val`` is removed, so the first column
    of each correlated group is the one that is kept.  Each offending pair
    is printed as ``column | earlier column | correlation``.

    Args:
        x: Feature DataFrame.  It is not modified.
        corr_val: Absolute-correlation threshold at or above which the
            later column of a pair is dropped (e.g. 0.8).

    Returns:
        A new DataFrame without the flagged columns.  When no pair reaches
        the threshold, all columns of ``x`` are retained.
    """
    corr_matrix = x.corr()
    labels = corr_matrix.columns
    drop_cols = []

    # Walk the strict upper triangle: column i against every earlier column j.
    for i in range(1, len(labels)):
        # range(i) includes the immediately preceding column, so adjacent
        # pairs (e.g. columns 0 and 1) are checked as well.
        for j in range(i):
            val = corr_matrix.iloc[j, i]
            # abs() treats strong negative correlation as redundancy too;
            # a NaN correlation compares False and is therefore kept.
            if abs(val) >= corr_val:
                # Prints the correlated feature set and the corr val
                print(labels[i], "|", labels[j], "|", round(val, 2))
                if labels[i] not in drop_cols:
                    drop_cols.append(labels[i])

    # Drop by label and in a single call: labels stay valid even when x has
    # columns that corr() left out, every flagged column is removed (not
    # just the last one), and an empty list simply returns all columns.
    return x.drop(drop_cols, axis=1)

# Absolute pairwise correlations, so strong negative relationships count too.
corr_matrix = df.corr().abs()

# np.where yields a (row_positions, column_positions) pair of arrays for every
# cell above the threshold.
high_corr_pos = np.where(corr_matrix > 0.8)

# Pair up the positions returned by np.where (not the matrix itself) and keep
# only the strict upper triangle: x < y skips the diagonal and reports each
# symmetric pair once.
high_corr_var = [
    (corr_matrix.columns[x], corr_matrix.columns[y])
    for x, y in zip(*high_corr_pos)
    if x < y
]