首页 > 其他分享 >Otto Group Product Classification

Otto Group Product Classification

时间:2023-01-18 18:11:37浏览次数:47  
标签:__ Product Group Classification self torch train test data

遇到的坑:

  1. 做多分类,用CrossEntropyLoss时,训练时候的正确标签的范围应该是[0,n-1],而不是[1,n],不然会报 IndexError: Target is out of bounds
    比如这题就应该预处理为[0,8],而不是[1,9]
  2. pd.read_csv 默认把 csv 文件的第一行当作列名(header),所以得到 data 以后,np.array(data) 里只有数据行,已经不包括原本 csv 文件里第一行的名称了
  3. 关于read_csv用相对路径读不到、用绝对路径就读得到的问题:相对路径是相对于程序运行时的当前工作目录解析的,而在vscode里工作目录通常是所打开的workspace根目录,不一定是脚本所在的目录;在vscode里直接打开脚本所在的文件夹(或改用绝对路径)即可

疑问:
读进来的数据都要变成np.float32形式?(torch.nn.Linear的权重默认是float32,输入的dtype必须与之一致,所以特征需要先转成np.float32;标签则保持整数类型。)

import torch
from torchvision import transforms
from torchvision import datasets
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd

# Mini-batch size intended for the DataLoader(s).
batch_size = 64

# Image-style preprocessing: convert to a tensor, then normalise with the
# given mean and standard deviation.
# NOTE(review): the only visible use is in the commented-out MNIST loaders
# further down, and the constants look like the usual MNIST statistics --
# presumably a leftover that the Otto pipeline does not need; confirm.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)

def process_label(labels):
    """Convert class-name labels such as ``'Class_3'`` to zero-based indices.

    ``torch.nn.CrossEntropyLoss`` expects targets in ``[0, n-1]``, so the
    one-based class number at the end of each label is shifted down by one.

    Args:
        labels: Iterable of strings that end in a decimal class number
            (e.g. ``'Class_1'`` ... ``'Class_9'``).

    Returns:
        A 1-D ``torch.long`` tensor holding ``class_number - 1`` per label.

    Raises:
        ValueError: If a label does not end in a digit.
    """
    indices = []
    for label in labels:
        # Take the whole trailing run of digits, not just the last character,
        # so that multi-digit classes such as 'Class_10' map to 9 (not -1).
        digits = label[len(label.rstrip('0123456789')):]
        if not digits:
            raise ValueError('label does not end in a class number: %r' % (label,))
        indices.append(int(digits) - 1)
    # Explicit dtype keeps the result an integer tensor even for empty input.
    return torch.tensor(indices, dtype=torch.long)

class otto(Dataset):
    """Map-style dataset backed by a CSV file loaded fully into memory.

    The ``'target'`` column supplies the labels (converted by
    ``process_label``); every column except the first and the last is used
    as a float32 feature.
    """

    def __init__(self, filepath):
        frame = pd.read_csv(filepath)
        targets = frame['target']

        # Slice away the first column (presumably a row id -- confirm) and
        # the last column before casting the remainder to float32.
        raw = np.array(frame)
        features = raw[:, 1:-1].astype(np.float32)

        self.len = len(frame)
        self.x_data = torch.tensor(features)
        self.y_data = process_label(targets)

    def __len__(self):
        """Number of rows read from the CSV."""
        return self.len

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        sample = self.x_data[index]
        label = self.y_data[index]
        return sample, label
        
# train_dataset=datasets.MNIST(root='./dataset/mnist/',train=True,download=True,transform=transform)
# train_loader=DataLoader(train_dataset,shuffle=True,batch_size=batch_size)
# test_dataset=datasets.MNIST(root='./dataset/mnist/',train=False,download=True,transform=transform)
# test_loader=DataLoader(test_dataset,shuffle=False,batch_size=batch_size)

# NOTE(review): absolute, machine-specific path -- the prediction step reads
# './test.csv' relative to the working directory; consider unifying the two.
dataset = otto('/Users/zzy81/Desktop/py/62/9/train.csv')
# Use the module-level batch_size instead of repeating the literal 64, so the
# batch size is configured in exactly one place.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)

class Net(torch.nn.Module):
    """Fully connected classifier: 93 input features -> 9 class scores.

    Four hidden linear layers (70, 60, 40, 20 units) with ReLU activations
    are followed by a linear output layer.
    """

    def __init__(self):
        super().__init__()
        self.l1 = torch.nn.Linear(93, 70)
        self.l2 = torch.nn.Linear(70, 60)
        self.l3 = torch.nn.Linear(60, 40)
        self.l4 = torch.nn.Linear(40, 20)
        self.l5 = torch.nn.Linear(20, 9)

    def forward(self, x):
        """Return raw class scores (logits) for ``x``.

        ``x`` must have 93 values along its last dimension (the input width
        of ``l1``).  No softmax is applied to the output layer.
        """
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        x = F.relu(self.l3(x))
        x = F.relu(self.l4(x))
        return self.l5(x)

    def solve(self, x):
        """Predict a class per row of ``x`` and return it one-hot encoded.

        Args:
            x: 2-D float tensor of shape ``(rows, 93)``.

        Returns:
            A ``pandas.DataFrame`` with one row per input row and one column
            per output unit of ``l5`` (labelled ``0 .. n-1``).  The column of
            the highest-scoring class holds 1, all others 0.
        """
        with torch.no_grad():
            # Reuse forward() instead of duplicating the layer chain.
            predicted = self.forward(x).argmax(dim=1)
            # A fixed num_classes guarantees a column for every class, even
            # one that is never predicted; building the dummies from the
            # observed predictions alone would drop such columns and break
            # callers that assign a fixed list of column names.
            one_hot = F.one_hot(predicted, num_classes=self.l5.out_features)
        return pd.DataFrame(one_hot.cpu().numpy().astype(np.uint8))

# Network instance whose parameters are handed to the optimizer below.
model = Net()

# Cross-entropy loss used as the training criterion.
criterion = torch.nn.CrossEntropyLoss()

# Stochastic gradient descent with momentum over all model parameters.
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.5,
)

def train(epoch):
    """Run one pass over ``train_loader``, updating ``model`` in place.

    After every 300th batch the mean loss of the last 300 batches is
    printed, labelled with the one-based epoch and batch numbers.

    Args:
        epoch: Zero-based epoch index; only used in the progress message.
    """
    window_loss = 0.0
    for step, (inputs, target) in enumerate(train_loader, start=1):
        # Standard step: clear gradients, forward, backward, update.
        optimizer.zero_grad()
        loss = criterion(model(inputs), target)
        loss.backward()
        optimizer.step()

        window_loss += loss.item()
        if step % 300:
            continue
        print('[%d, %5d] loss:%.3f' % (epoch + 1, step, window_loss / 300))
        window_loss = 0.0

# def test():
#     correct=0
#     total=0
#     with torch.no_grad():
#         for data in test_loader:
#             images,labels=data
#             outputs=model(images)
#             _,predicted=torch.max(outputs.data,dim=1)
#             total+=labels.size(0)
#             correct+=(predicted==labels).sum().item()
#     print('accuracy on test set : %d %%' % (100*correct/total))

if __name__ == '__main__':
    # Train for a fixed number of epochs.
    for epoch in range(3000):
        train(epoch)

    # Column names for the one-hot prediction columns returned by solve().
    class_columns = [
        'Class_1', 'Class_2', 'Class_3',
        'Class_4', 'Class_5', 'Class_6',
        'Class_7', 'Class_8', 'Class_9',
    ]

    # Every column after the first one is fed to the model as a float32 feature.
    test_frame = pd.read_csv('./test.csv')
    features = np.array(test_frame)[:, 1:].astype(np.float32)
    submission = model.solve(torch.tensor(features))

    # Name the class columns, put the 'id' column first, and write the CSV.
    submission.columns = class_columns
    submission.insert(0, 'id', test_frame['id'])
    pd.DataFrame(submission).to_csv('./zzy_predict.csv', index=False)

标签:__,Product,Group,Classification,self,torch,train,test,data
From: https://www.cnblogs.com/zzythebest/p/17058944.html

相关文章