While looking for datasets online I happened to come across this competition, and out of curiosity and a willingness to give it a try, I signed up. The competition produced some results I did not expect, so I am writing them down here.
The task: building on China Unicom's big-data capabilities, model Unicom's signaling data, call records, internet-behaviour data and so on to predict whether an individual will return to their hometown to work. The above is the background description from the official site. From the perspective of the literature, the problem maps onto population-flow prediction scenarios.
Dataset introduction:
An open-source baseline for this problem has already been shared online. The rough idea of that solution is:
# Imports used by the excerpt below (train_data / test_data are the competition's
# train and test DataFrames; the loading code is omitted here).
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

train_data['f47'] = train_data['f1'] * 10 + train_data['f2']
test_data['f47'] = test_data['f1'] * 10 + test_data['f2']

loc_f = ['f1', 'f2', 'f4', 'f5', 'f6']
for df in [train_data, test_data]:
    for i in range(len(loc_f)):
        for j in range(i + 1, len(loc_f)):
            df[f'{loc_f[i]}+{loc_f[j]}'] = df[loc_f[i]] + df[loc_f[j]]
            df[f'{loc_f[i]}-{loc_f[j]}'] = df[loc_f[i]] - df[loc_f[j]]
            df[f'{loc_f[i]}*{loc_f[j]}'] = df[loc_f[i]] * df[loc_f[j]]
            df[f'{loc_f[i]}/{loc_f[j]}'] = df[loc_f[i]] / (df[loc_f[j]] + 1)

com_f = ['f43', 'f44', 'f45', 'f46']
for df in [train_data, test_data]:
    for i in range(len(com_f)):
        for j in range(i + 1, len(com_f)):
            df[f'{com_f[i]}+{com_f[j]}'] = df[com_f[i]] + df[com_f[j]]
            df[f'{com_f[i]}-{com_f[j]}'] = df[com_f[i]] - df[com_f[j]]
            df[f'{com_f[i]}*{com_f[j]}'] = df[com_f[i]] * df[com_f[j]]
            df[f'{com_f[i]}/{com_f[j]}'] = df[com_f[i]] / (df[com_f[j]] + 1)
cat_columns = ['f3']
data = pd.concat([train_data, test_data])
for col in cat_columns:
    lb = LabelEncoder()
    lb.fit(data[col])
    train_data[col] = lb.transform(train_data[col])
    test_data[col] = lb.transform(test_data[col])
num_columns = [ col for col in train_data.columns if col not in ['id', 'label', 'f3']]
feature_columns = num_columns + cat_columns
target = 'label'
train = train_data[feature_columns]
label = train_data[target]
test = test_data[feature_columns]
print(train.shape)
print(train)
features = [i for i in train.columns if i not in ['label', 'id']]
y = label  # the training labels saved above (train holds the features only)
KF = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)
feat_imp_df = pd.DataFrame({'feat': features, 'imp': 0})
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'n_jobs': 30,
    'learning_rate': 0.05,
    'num_leaves': 2 ** 6,
    'max_depth': 8,
    'tree_learner': 'serial',
    'colsample_bytree': 0.8,
    'subsample_freq': 1,
    'subsample': 0.8,
    'num_boost_round': 5000,
    'max_bin': 255,
    'verbose': -1,
    'seed': 2021,
    'bagging_seed': 2021,
    'feature_fraction_seed': 2021,
    'early_stopping_rounds': 100,
}
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros((len(test)))
# Model training
for fold_, (trn_idx, val_idx) in enumerate(KF.split(train.values, y.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=y.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=y.iloc[val_idx])
    num_round = 3000
    clf = lgb.train(
        params,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        verbose_eval=100,
        early_stopping_rounds=50,
    )
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    predictions_lgb[:] += clf.predict(test[features], num_iteration=clf.best_iteration) / 5
    feat_imp_df['imp'] += clf.feature_importance() / 5

print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
print("F1 score: {}".format(f1_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
print("Precision score: {}".format(precision_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
print("Recall score: {}".format(recall_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
I will not paste the full baseline code here; if you need it, you can get it from the comment section of the competition page.
The above is the open-sourced solution. In my personal view it has a few small issues (even though my own ranking was terrible, I still want to state my opinion):
The above is my understanding of the scenario. In other words, I think this prediction problem calls for a model that can adaptively learn how important each feature is to the target, make use of the unlabeled data, and automatically mine the correlations between samples.
Importing the libraries
import os
import math
import time
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import networkx as ntx
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch_geometric
from torchmetrics import Accuracy
from torchmetrics import AUROC
# from torchmetrics.classification import BinaryAccuracy
Loading the dataset
batchsize = 1024  # the batch size directly determines the size of the graph's adjacency matrix, so it must not be too small
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
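To make the comment above concrete (a rough illustration of my own, not taken from the competition materials): the model below builds one graph per mini-batch, so the batch size is exactly the number of graph nodes, and each sample can only exchange information with the other samples in the same batch.
# Rough illustration (my own numbers, not part of the original code):
# a batch of size B gives a graph with B nodes, so each sample has at most
# B - 1 candidate neighbours for the GAT layers to attend over.
for B in (64, 256, 1024):
    print(f"batch {B}: {B} nodes, {B - 1} candidate neighbours per node")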
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, root, datatype, num=49858):
        name1 = os.path.join(root, 'dataTrain.csv')
        name2 = os.path.join(root, 'dataNoLabel.csv')
        name3 = os.path.join(root, 'dataA.csv')
        name = ''
        if datatype == 'train':
            name = name1
        else:
            name = name3
        df = pd.read_csv(name)
        df1 = pd.read_csv(name1)
        df2 = pd.read_csv(name2)
        df3 = pd.read_csv(name3)
        # tag the unlabeled and test rows (not used further in this excerpt)
        self.data2 = df2.insert(loc=df2.shape[1], column='label', value=2)
        self.data3 = df3.insert(loc=df3.shape[1], column='label', value=3)
        data = pd.DataFrame(df, columns=['f3'])
        dummies = pd.get_dummies(data)
        # min-max normalise every numeric column with the statistics of the training file
        for index, row in df.iteritems():
            if index not in ['id', 'label', 'f3']:
                a = np.min(np.array(df1[[index]]))
                b = np.max(np.array(df1[[index]]))
                df[[index]] = df[[index]].apply(lambda x: (x - a) / (b - a))
        for index, row in dummies.iteritems():
            # print(index)
            # print(dummies[index])
            # print(row)
            df.insert(loc=3, column=index, value=row.tolist())
        # f3 is a categorical feature, so it is replaced by its one-hot encoding
        df = df.drop(columns='f3', inplace=False)
        print(df.shape)
        # self.traindata = df.sample(n=num, frac=None, replace=False, weights=None, random_state=None, axis=0)
        # print(self.traindata.shape)
        # self.testdata = df[~df.index.isin(self.traindata.index)]
        # print(self.testdata.shape)
        self.data = df
        self.datatype = datatype

    def getdata(self, index, df):
        a = df.iloc[index, 1:49].values.tolist()
        b = df.iloc[index, 49:].values.tolist()
        a = [float(i) for i in a]
        b = [float(i) for i in b]
        X = torch.tensor(a, dtype=torch.float32)
        # X = X.unsqueeze(-1)
        Y = torch.tensor(b, dtype=torch.float32)
        return X, Y

    def __getitem__(self, index):
        samples, labels = self.getdata(index, self.data)
        sample = [samples, labels]
        return sample

    def __len__(self):
        return self.data.shape[0]


traindata = MyDataset(root='./data/person/', datatype='train')
print(len(traindata))
testdata=MyDataset(root='./data/person/',datatype='test')
print(len(testdata))
train_size = int(len(traindata) * 0.9)
test_size = len(traindata) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(traindata, [train_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=batchsize, shuffle=False)
print(len(train_dataset))
print(len(test_dataset))
print(len(testdata))
# for step, (input, label) in enumerate(train_loader):
#     print(input.shape)
#     print(label.shape)
print('+++++++++++++++++++++test++++++++++++++++++++')
test_loader = DataLoader(test_dataset, batch_size=batchsize, shuffle=False)
# for step, (input, label) in enumerate(test_loader):
#     print(input.shape)
#     print(label.shape)
Multi-head attention mechanism
class selfAttention(nn.Module):
    def __init__(self, num_attention_heads, input_size, hidden_size):
        super(selfAttention, self).__init__()
        if hidden_size % num_attention_heads != 0:
            raise ValueError(
                "the hidden size %d is not a multiple of the number of attention heads %d"
                % (hidden_size, num_attention_heads))
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = int(hidden_size / num_attention_heads)
        self.all_head_size = hidden_size
        self.key_layer = nn.Linear(input_size, hidden_size)
        self.query_layer = nn.Linear(input_size, hidden_size)
        self.value_layer = nn.Linear(input_size, hidden_size)

    def trans_to_multiple_heads(self, x):
        new_size = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_size)
        return x.permute(0, 2, 1, 3)

    def forward(self, x):
        # Q, K, V projections
        key = self.key_layer(x)
        query = self.query_layer(x)
        value = self.value_layer(x)
        key_heads = self.trans_to_multiple_heads(key)
        query_heads = self.trans_to_multiple_heads(query)
        value_heads = self.trans_to_multiple_heads(value)
        attention_scores = torch.matmul(query_heads, key_heads.permute(0, 1, 3, 2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        attention_probs = F.softmax(attention_scores, dim=-1)
        context = torch.matmul(attention_probs, value_heads)
        context = context.permute(0, 2, 1, 3).contiguous()
        new_size = context.size()[:-2] + (self.all_head_size,)
        context = context.view(*new_size)
        return context
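A quick shape check for this block (my own sketch; the constructor arguments match the ones used in attgraNet below, i.e. selfAttention(4, 10, 48)):
# Smoke test for the attention block (my own sketch, not part of the original post).
att = selfAttention(num_attention_heads=4, input_size=10, hidden_size=48)
dummy = torch.randn(2, 48, 10)   # (batch, 48 feature "tokens" produced by fc1, 10-dim embedding each)
print(att(dummy).shape)          # torch.Size([2, 48, 48]) -> flattened to 48*48 before the first GATConv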
Graph learning layer
class attgraNet(nn.Module):
    def __init__(self, inputsize, batchsize, k):
        super(attgraNet, self).__init__()
        self.k = k
        self.batchsize = batchsize
        self.inputsize = inputsize
        self.fc1 = nn.Linear(inputsize, inputsize * 10)
        self.att = selfAttention(4, 10, 48)
        # self.para = torch.nn.Parameter(torch.ones([2, batchsize]), requires_grad=True)
        # four GAT layers
        self.gat1 = torch_geometric.nn.GATConv(48 * 48, 16, 16, dropout=0.6)
        self.act1 = nn.LeakyReLU(0.1)
        self.gat2 = torch_geometric.nn.GATConv(256, 8, 8, dropout=0.6)
        self.act2 = nn.LeakyReLU(0.1)
        self.gat3 = torch_geometric.nn.GATConv(64, 8, 8, dropout=0.6)
        self.act3 = nn.LeakyReLU(0.1)
        self.gat4 = torch_geometric.nn.GATConv(64, 16, 16, dropout=0.6)
        self.act4 = nn.LeakyReLU(0.1)
        self.fc2 = nn.Sequential(
            nn.Linear(16 * 16, 84),
            nn.LeakyReLU(0.1),
            nn.BatchNorm1d(84))
        self.fc3 = nn.Linear(84, 2)

    def forward(self, x):
        # print(x.device)
        # per-sample feature embedding followed by self-attention over the 48 feature tokens
        x = self.fc1(x)
        x = x.unsqueeze(-1)
        x = x.reshape(-1, self.inputsize, 10)
        x = self.att(x)
        x = x.reshape(x.shape[0], -1, 1)
        a = x
        dim0, dim1, dim2 = a.shape
        # build the edge index from pairwise cosine similarity between the samples in the batch
        # (note: para has one column per node, so each of the k iterations overwrites the
        # previous one and only the last pick is kept)
        para = torch.ones([2, a.shape[0]], dtype=torch.long).to(device)
        for i in range(dim0):
            score = torch.zeros(dim0)
            for j in range(dim0):
                if i != j:
                    score[j] = torch.abs(torch.cosine_similarity(a[i], a[j], dim=0))
            for j in range(self.k):
                idx = torch.argmax(score, dim=0)
                para.data[0][i] = i
                para.data[1][i] = idx
                score[idx] = 0
        # assemble the adjacency (edge_index) and run the graph through the GAT stack
        x = x.reshape(dim0, -1)
        data = torch_geometric.data.Data(x=x, edge_index=para.long()).to(device)
        x = self.gat1(data.x, data.edge_index)
        x = self.act1(x)
        x = self.gat2(x, data.edge_index)
        x = self.act2(x)
        x = self.gat3(x, data.edge_index)
        x = self.act3(x)
        x = self.gat4(x, data.edge_index)
        x = self.act4(x)
        x = self.fc2(x)
        x = F.dropout(x, training=self.training)
        x = self.fc3(x)
        return F.log_softmax(x, dim=1)
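One thing worth pointing out about the neighbour search above: because para has a single column per node, only the last of the k picks survives, and the double Python loop is slow for large batches. Below is a vectorized alternative I sketched for comparison (my own code, not the original author's), which actually keeps k neighbours per node:
# A vectorized alternative for building the k-nearest-neighbour edge_index
# (my own sketch, not the original author's code).
def build_knn_edge_index(feats, k):
    """Connect every sample to its k most cosine-similar neighbours."""
    normed = F.normalize(feats, dim=1)                  # (batch, feature_dim)
    sim = torch.abs(normed @ normed.t())                # (batch, batch) pairwise |cosine similarity|
    sim.fill_diagonal_(0)                               # ignore self-similarity
    topk = sim.topk(k, dim=1).indices                   # (batch, k) neighbour indices
    src = torch.arange(feats.shape[0], device=feats.device).repeat_interleave(k)
    return torch.stack([src, topk.reshape(-1)], dim=0)  # shape (2, batch * k)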
Training
net = attgraNet(48,batchsize,5)
print(net)
net.to(device)
print(next(net.parameters()).device)
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(net.parameters())
print(torch.cuda.is_available())
def train(epoch, model):
    model.train()
    running_loss = 0.0
    for i, (X, y) in enumerate(train_loader):
        input = X.to(device)
        y = y.to(device)
        output = model.forward(input)
        # print(output.shape)
        # print(y.squeeze(dim=1).shape)
        loss = criterion(output, y.squeeze(dim=1).long())
        print("[{}, {}] loss: {}".format(epoch, i, loss))
        running_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    epoch_loss_train = running_loss / (len(train_dataset) / batchsize)
    print(epoch_loss_train)
    return epoch_loss_train
Testing
def val(model):
    model.eval()
    running_loss = 0.0
    n = 0
    result = 0
    resauroc = 0
    metric = Accuracy(top_k=1)
    # auroc = AUROC(num_classes=2)
    # the proper metric would be AUC, but using the AUC from torchmetrics runs out of GPU memory
    with torch.no_grad():
        for i, (data, y) in enumerate(test_loader):
            input = data.to(device)
            y = y.to(device)
            optimizer.zero_grad()
            output = model.forward(input)
            # print(y.squeeze(dim=1).long().shape)
            # print(output.shape)
            loss = criterion(output, y.squeeze(dim=1).long())
            print("[{}] loss: {}".format(i, loss))
            n = n + 1
            res = metric(output.cpu(), y.squeeze(dim=1).long().cpu())
            # res1 = auroc(output.cpu(), y.squeeze(dim=1).long().cpu())
            print("[{}] ACC: {}".format(i, res))
            # print("[{}] AUROC: {}".format(i, res1))
            result = res + result
            # resauroc = res1 + resauroc
            running_loss += loss.item()
    epoch_loss_val = running_loss / (len(test_dataset) / batchsize)
    print(epoch_loss_val)
    print(result / n)
    # print(resauroc / n)
    return epoch_loss_val


val(net)
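Regarding the AUC issue mentioned in the comment above: one workaround I would try (my own sketch, not part of the original code) is to collect the predicted probabilities on the CPU and compute the AUC once per pass with scikit-learn, so nothing extra has to stay on the GPU:
# AUC on the CPU with scikit-learn (my own sketch, not part of the original code).
from sklearn.metrics import roc_auc_score

def val_auc(model, loader):
    model.eval()
    probs, targets = [], []
    with torch.no_grad():
        for data, y in loader:
            output = model(data.to(device))            # log-probabilities from log_softmax
            probs.append(output.exp()[:, 1].cpu())     # probability of the positive class
            targets.append(y.squeeze(dim=1).cpu())
    return roc_auc_score(torch.cat(targets).numpy(), torch.cat(probs).numpy())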
Remaining code
def main(model):
    min_loss = 100000000.0
    loss_train = []
    loss_val = []
    epochs = 200
    since = time.time()
    for epoch in range(epochs):
        epoch_loss_train = train(epoch, model)
        loss_train.append(epoch_loss_train)
        epoch_loss_val = val(model)
        loss_val.append(epoch_loss_val)
        if epoch_loss_val < min_loss:
            min_loss = epoch_loss_val
            best_model_wts = model.state_dict()
            # torch.save(best_model_wts, os.path.join(parameter_address, experiment_name + '.pkl'))
            torch.save(model.state_dict(), 'bestsaveBIG.pt')
        model_wts = model.state_dict()
        # torch.save(model_wts, os.path.join(parameter_address, experiment_name + "_" + str(epoch) + '.pkl'))
        time_elapsed = time.time() - since
        # torch.save(model, str(epoch) + '.pt')
    torch.save(model.state_dict(), 'lastsaveBIG.pt')
if __name__ == "__main__":
    main(net)
    print('train finish')
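As a follow-up to main() above, reloading the best checkpoint for inference would look roughly like this (my own sketch; the file name comes from the torch.save call in main()):
# Reload the best checkpoint saved by main() (my own sketch, not part of the original post).
best_net = attgraNet(48, batchsize, 5)
best_net.load_state_dict(torch.load('bestsaveBIG.pt', map_location=device))
best_net.to(device)
best_net.eval()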
Tips