# Kaggle "House Prices" regression with MXNet Gluon (linear model baseline).
import gluonbook as gb
from mxnet import autograd, nd, gluon, init
from mxnet.gluon import data as gdata, loss as gloss, nn
import numpy as np
import pandas as pd

# Load the Kaggle train/test splits from the working directory.
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')
print(train_data.shape)
print(test_data.shape)

# iloc vs. loc: iloc is purely positional (NumPy-style indexing, half-open
# intervals; a bare integer or slice selects rows, an integer yielding a
# Series), whereas loc indexes by label (labels may themselves be numbers).
print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])

# Stack train and test features so they share one preprocessing pass.
# Column 0 (Id) is dropped from both; the last train column (SalePrice,
# the label) is dropped as well.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

# Standardise every non-object (numeric) column to zero mean, unit std.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda col: (col - col.mean()) / col.std())
# After standardisation each numeric column has mean 0, so imputing the
# column mean is the same as imputing 0 for the missing entries.
all_features = all_features.fillna(all_features.mean())
# One-hot encode the categorical columns; dummy_na=True also emits an
# indicator column for missing values.
all_features = pd.get_dummies(all_features, dummy_na=True)
print(all_features.shape)

# Split back into train/test NDArrays; labels as a column vector.
n_train = train_data.shape[0]
train_features = nd.array(all_features[:n_train].values)
test_features = nd.array(all_features[n_train:].values)
train_labels = nd.array(train_data.SalePrice.values).reshape((-1, 1))

# Squared loss (gluon's L2Loss includes a factor of 1/2).
loss = gloss.L2Loss()
def get_net(): net = nn.Sequential() net.add(nn.Dense(1)) net.initialize() return net
def log_rmse(net, features, labels):
    """Return the RMSE between log-predictions and log-labels (Kaggle metric).

    nd.clip(a, lo, hi) bounds every element of `a` to [lo, hi]; clipping
    predictions to [1, inf) keeps log() finite.  The factor 2 cancels the
    1/2 baked into gluon's L2Loss before the square root is taken.
    """
    preds = nd.clip(net(features), 1, float('inf'))
    mean_sq = loss(preds.log(), labels.log()).mean()
    return nd.sqrt(2 * mean_sq).asscalar()
def train(net,train_features,train_labels,test_features,test_labels, num_epochs,learning_rate,weight_decay,batch_size): train_ls,test_ls = [],[] train_iter = gdata.DataLoader(gdata.ArrayDataset(train_features,train_labels),batch_size,shuffle=True) trainer = gluon.Trainer(net.collect_params(),'adam',{'learning_rate':learning_rate,'wd':weight_decay}) for epoch in range(num_epochs): for X,y in train_iter: with autograd.record(): l = loss(net(X),y) l.backward() trainer.step(batch_size) train_ls.append(log_rmse(net,train_features,train_labels)) if test_labels is not None: test_ls.append(log_rmse(net,test_features,test_labels)) return train_ls,test_ls
def get_k_fold_data(k,i,X,y): assert k > 1 fold_size = X.shape[0] // k X_train,y_train = None,None for j in range(k): idx = slice(j * fold_size,(j + 1) * fold_size) X_part,y_part = X[idx,:],y[idx] if j == 1: X_valid,y_valid = X_part,y_part elif X_train is None: X_train,y_train = X_part,y_part else: X_train = nd.concat(X_train,X_part,dim=0) y_train = nd.concat(y_train,y_part,dim=0) return X_train,y_train,X_valid,y_valid
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    """Run k-fold cross-validation; return (avg train rmse, avg valid rmse).

    A fresh network is trained on each fold.  Only fold 0's learning
    curves are plotted, via gb.semilogy (which draws x/y pairs with a
    log-scaled y axis and an optional second series + legend).
    """
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        fold = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *fold, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            gb.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse',
                        range(1, num_epochs + 1), valid_ls,
                        ['train', 'valid'])
        print('fold %d,train rmse: %f,valid rmse: %f'
              % (i, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k
k,num_epochs,lr,weight_decay,batch_size = 10,165,5,0,60 train_l,valid_l = k_fold(k,train_features,train_labels,num_epochs,lr, weight_decay,batch_size) print('%d-fold validation: avg train rmse: %f,avg valid rmse: %f' % (k,train_l,valid_l)) ''' 有时候你会发现一组参数的训练误差可以达到很低,但是在K折交叉验证上的误差可能反而较高。 这种现象很可能是由于过拟合造成的。因此,当训练误差降低时,我们要观察K折交叉验证上的 误差是否也相应降低。 '''
def train_and_pred(train_features,test_features,train_labels,test_data, num_epochs,lr,weight_decay,batch_size): net = get_net() train_ls,_ = train(net,train_features,train_labels,None,None, num_epochs,lr,weight_decay,batch_size) gb.semilogy(range(1,num_epochs + 1),train_ls,'epochs','rmse') print('train rmse %f' % train_ls[-1]) preds = net(test_features).asnumpy() test_data['SalePrice'] = pd.Series(preds.reshape(1,-1)[0]) submission = pd.concat([test_data['Id'],test_data['SalePrice']],axis=1) submission.to_csv('submission.csv',index=False)
train_and_pred(train_features,test_features,train_labels,test_data, num_epochs,lr,weight_decay,batch_size) ''' 上述代码执行完之后会生成一个“submission.csv”文件。 这个文件是符合 Kaggle 比赛要求的提交格式的。这时, 我们可以在 Kaggle 上把我们预测得出的结果进行提交, 并且查看与测试数据集上真实房价(标签)的误差。具体 来说有以下几个步骤:你需要登录 Kaggle 网站,访问 房价预测比赛网页,并点击右侧“Submit Predictions” 或“Late Submission”按钮。然后,点击页面下方 “Upload Submission File”图标所在的虚线框选择 需要提交的预测结果文件。最后,点击页面最下方的 “Make Submission”按钮就可以查看结果了 '''