pandas合并pd.concat-pd.merge

pandas 合并数据常用的两个函数:

刚好有一个对比 RNN 和 LSTM 神经网络运行结果的场景分别用到了这两个函数:

  • 采用 concat 合并两个csv表; csv表为两次运行结果的,列名基本一致。
  • 采用 merge 合并模型运行条件类似的表, 用于对比数据
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#%%

import pandas as pd
import io

pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
# -------------
print('*** 基础数据 ***')
res1_csv = '''
model,steps,out_size,seed,epochs,rmse1p,r_2_1p,rmse2p,r_2_2p,rmse3p,r_2_3p
RNN,24,2,0,0,0.1131,0.8846,0.1681,0.7449,,
RNN,48,2,0,0,0.1119,0.8871,0.1694,0.7412,,
LSTM,24,2,0,0,0.1108,0.8894,0.166,0.7514,,
LSTM,48,2,0,0,0.1109,0.8892,0.1661,0.7511,,
RNN,24,3,0,0,0.1157,0.8797,0.1629,0.76,0.1978,0.6471
RNN,48,3,0,0,0.1159,0.8792,0.162,0.7628,0.1982,0.6455
LSTM,24,3,0,0,0.1126,0.886,0.1584,0.7732,0.1926,0.6653
LSTM,48,3,0,0,0.1125,0.8863,0.1587,0.7723,0.1933,0.6629
'''
df_res1 = pd.read_csv(io.StringIO(res1_csv))
print(df_res1)

#%%
# -------------
print('\n*** 对比数据 ***')
res2_csv = '''
model,steps,seed,epochs,rmse1,r_2_1,rmse2,r_2_2
LSTM,24,1,50,0.1110,0.8890,0.1661,0.7512
LSTM,24,2,50,0.1109,0.8892,0.1658,0.7520
LSTM,24,3,50,0.1118,0.8874,0.1667,0.7494
LSTM,24,4,50,0.1109,0.8891,0.1666,0.7496
LSTM,24,5,50,0.1127,0.8855,0.1662,0.7507
LSTM,24,6,50,0.1107,0.8896,0.1659,0.7518
LSTM,24,7,50,0.1110,0.8889,0.1659,0.7516
LSTM,24,8,50,0.1119,0.8872,0.1667,0.7493
LSTM,24,9,50,0.1109,0.8892,0.1657,0.7523
LSTM,48,1,50,0.1110,0.8891,0.1663,0.7504
LSTM,48,2,50,0.1106,0.8897,0.1658,0.7520
LSTM,48,3,50,0.1120,0.8869,0.1671,0.7481
LSTM,48,4,50,0.1117,0.8876,0.1677,0.7464
LSTM,48,5,50,0.1110,0.8890,0.1662,0.7508
LSTM,48,6,50,0.1145,0.8819,0.1683,0.7445
LSTM,48,7,50,0.1113,0.8883,0.1676,0.7467
LSTM,48,8,50,0.1106,0.8898,0.1665,0.7499
LSTM,48,9,50,0.1107,0.8896,0.1657,0.7524
RNN,24,1,50,0.1219,0.8660,0.1723,0.7321
RNN,24,2,50,0.1114,0.8882,0.1674,0.7471
RNN,24,3,50,0.1139,0.8830,0.1675,0.7468
RNN,24,4,50,0.1106,0.8898,0.1678,0.7461
RNN,24,5,50,0.1145,0.8818,0.1738,0.7275
RNN,24,6,50,0.1134,0.8842,0.1677,0.7463
RNN,24,7,50,0.1164,0.8778,0.1694,0.7412
RNN,24,8,50,0.1121,0.8867,0.1711,0.7358
RNN,24,9,50,0.1104,0.8901,0.1668,0.7490
RNN,48,1,50,0.1121,0.8868,0.1682,0.7447
RNN,48,2,50,0.1134,0.8842,0.1657,0.7524
RNN,48,3,50,0.1122,0.8865,0.1678,0.7459
RNN,48,4,50,0.1123,0.8864,0.1673,0.7476
RNN,48,5,50,0.1146,0.8817,0.1694,0.7411
RNN,48,6,50,0.1185,0.8735,0.1722,0.7325
RNN,48,7,50,0.1122,0.8865,0.1725,0.7317
RNN,48,8,50,0.1175,0.8756,0.1752,0.7231
RNN,48,9,50,0.1130,0.8849,0.1662,0.7509
'''

df_res2 = pd.read_csv(io.StringIO(res2_csv))
df_res2['out_size'] = 2


res3_csv = '''
model,steps,seed,epochs,rmse1,r_2_1,rmse2,r_2_2,rmse3,r_2_3
LSTM,24,1,50,0.114,0.8833,0.1588,0.7721,0.1937,0.6615
LSTM,24,2,50,0.1126,0.8861,0.1588,0.7721,0.1928,0.6648
LSTM,24,3,50,0.1138,0.8835,0.1585,0.7728,0.1931,0.6637
LSTM,24,4,50,0.1124,0.8864,0.1583,0.7735,0.1925,0.6657
LSTM,24,5,50,0.1131,0.8851,0.1581,0.7741,0.1922,0.6667
LSTM,24,6,50,0.1126,0.886,0.159,0.7715,0.1931,0.6636
LSTM,24,7,50,0.1129,0.8854,0.16,0.7684,0.1942,0.6599
LSTM,24,8,50,0.1121,0.887,0.1583,0.7734,0.1931,0.6635
LSTM,24,9,50,0.1126,0.8861,0.1584,0.7732,0.193,0.6639
LSTM,48,1,50,0.1119,0.8874,0.1578,0.7748,0.1928,0.6646
LSTM,48,2,50,0.1124,0.8865,0.1592,0.771,0.1948,0.6577
LSTM,48,3,50,0.1131,0.8851,0.1588,0.772,0.1932,0.6634
LSTM,48,4,50,0.1127,0.8858,0.158,0.7742,0.193,0.6639
LSTM,48,5,50,0.1124,0.8865,0.1583,0.7735,0.1932,0.6632
LSTM,48,6,50,0.1122,0.8869,0.1585,0.773,0.1933,0.663
LSTM,48,7,50,0.1135,0.8842,0.1599,0.7688,0.1934,0.6626
LSTM,48,8,50,0.1121,0.887,0.158,0.7744,0.1932,0.6632
LSTM,48,9,50,0.1132,0.8848,0.1591,0.7711,0.193,0.6639
RNN,24,1,50,0.1175,0.876,0.1597,0.7694,0.195,0.657
RNN,24,2,50,0.1153,0.8804,0.1597,0.7693,0.1946,0.6584
RNN,24,3,50,0.1176,0.8757,0.159,0.7714,0.1951,0.6567
RNN,24,4,50,0.1144,0.8823,0.159,0.7715,0.1949,0.6572
RNN,24,5,50,0.1146,0.8819,0.1599,0.769,0.196,0.6533
RNN,24,6,50,0.1168,0.8774,0.1611,0.7653,0.1954,0.6556
RNN,24,7,50,0.1192,0.8723,0.1594,0.7702,0.194,0.6604
RNN,24,8,50,0.1154,0.8803,0.1591,0.7712,0.1942,0.6599
RNN,24,9,50,0.1152,0.8806,0.1609,0.7659,0.1955,0.6551
RNN,48,1,50,0.116,0.879,0.1611,0.7653,0.1985,0.6445
RNN,48,2,50,0.113,0.8853,0.161,0.7657,0.1957,0.6544
RNN,48,3,50,0.115,0.8812,0.1589,0.7716,0.195,0.6568
RNN,48,4,50,0.1131,0.8851,0.159,0.7715,0.1945,0.6589
RNN,48,5,50,0.1141,0.883,0.1599,0.769,0.1953,0.6558
RNN,48,6,50,0.1156,0.8799,0.1596,0.7698,0.1952,0.6561
RNN,48,7,50,0.1169,0.8773,0.1626,0.761,0.1967,0.6508
RNN,48,8,50,0.1213,0.8678,0.1605,0.7672,0.1946,0.6583
RNN,48,9,50,0.1132,0.8848,0.1587,0.7723,0.1943,0.6594
'''

df_res3 = pd.read_csv(io.StringIO(res3_csv))
df_res3['out_size'] = 3
print(df_res2)
print(df_res3)

#%%
# concat
# res2、res3 分别为两次运行结果的
# 由于res3 比 res2 多两列, 合并后 res2 会有两列缺失的数据。
print('\n*** 合并对比数据 Nx2 Nx3 ***')
df_res = pd.concat((df_res2, df_res3)).reset_index(drop=True)
df_res

# 将列重新排序
df_res = df_res[['model', 'steps', 'out_size', 'seed', 'epochs',
'rmse1', 'r_2_1', 'rmse2', 'r_2_2', 'rmse3', 'r_2_3']]
# print(df_res.columns)
df_res

#%%
# merge
# 这里将res 与 res1 按照模型、步长、输出等条件进行合并
# 会产生扩展的列,不需要的列一并删掉

print('\n*** 拼接结果 ***')
df = pd.merge(df_res, df_res1, how='outer',
on=['model', 'steps', 'out_size'],
suffixes=('', '_DROP')).filter(regex='^(?!.*_DROP)')
df.info()
df = df.fillna(0)
df.head(10)