Text Classification with BERT (Using the 2023 CCF Few-Shot Data Classification Task as an Example)

Published: 2023-11-09 09:43:44  Author: tw_nlp

Text Classification Task

1. Problem Analysis

  Text classification is a classic entry-level NLP task, well known for its wide range of applications. Taking this year's competition as an example, this article walks through the steps for implementing a text classification task.

  Since the competition is a text classification problem, we first need to pin down its inputs and outputs. From the data description, each sample is a patent whose input consists of three fields, title, assignee, and abstract, and whose label is the patent's classification category. We therefore treat the problem as a standard text classification task and solve it with BERT.
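
For concreteness, a single training record might look like the sketch below. The field values here are invented; only the field names (title, assignee, abstract, label_id) follow the loading code later in this article. The three text fields are concatenated into one passage, and label_id is the target class.

# A made-up example record; real lines in train_zl.json are one JSON object per line
sample = {
    "title": "一种锂电池负极材料及其制备方法",
    "assignee": "某某新能源科技有限公司",
    "abstract": "本发明公开了一种锂电池负极材料……",
    "label_id": "3",
}

# Flatten the three fields into a single input passage (same template as the training script below)
text = (f"这份专利的标题为:《{sample['title']}》,"
        f"由“{sample['assignee']}”公司申请,详细说明如下:{sample['abstract']}")
label = int(sample["label_id"])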

 2. Code Implementation

# Install the required libraries (torch, scikit-learn and tqdm are also needed if not already installed)
!pip install transformers

 

# Data loading
import torch
from torch.utils.data import Dataset

class CLSDataset(Dataset):
    def __init__(self, data_list, label_list, tokenizer, max_len):
        """

        :param data_list:
        :param label_list:
        :param tokenizer:
        :param max_len:
        """
        self.data_list = data_list
        self.label_list = label_list
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, item):
        data_item = self.data_list[item]
        label_item = self.label_list[item]
        inputs = self.tokenizer.encode_plus(data_item, None, add_special_tokens=True, max_length=self.max_len,
                                            padding='max_length', truncation=True,
                                            return_token_type_ids=True)
        return {
            "input_ids": torch.tensor(inputs['input_ids'], dtype=torch.long),
            "attention_mask": torch.tensor(inputs['attention_mask'], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            "labels": torch.tensor(label_item)
        }

    def __len__(self):
        return len(self.data_list)
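
As a quick sanity check, one item from the dataset is a dict of fixed-length tensors. The two toy texts and labels below are made up; the tokenizer path is the same one used in the training script later.

from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large")
ds = CLSDataset(["专利文本一", "专利文本二"], [0, 1], tok, max_len=32)

sample = ds[0]
print(sample["input_ids"].shape)       # torch.Size([32])
print(sample["attention_mask"].shape)  # torch.Size([32])
print(sample["labels"])                # tensor(0)
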
# Model training
from tqdm import tqdm

def cls_train(train_loader, model, loss, optimizer, device):
    """

    :param train_loader:
    :param model:
    :param loss:
    :param optimizer:
    :param schedule:
    :param device:
    :return:
    """
    model.train()

    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        targets = batch['labels'].to(device)
        # Passing labels makes BertForSequenceClassification return the cross-entropy loss
        outputs1 = model(input_ids, attention_mask=attention_mask,
                         token_type_ids=token_type_ids, labels=targets)
        outputs1.loss.backward()

        optimizer.step()
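
The docstring above mentions a schedule parameter, but cls_train itself does not take one. If a learning-rate schedule is wanted, a minimal sketch is shown below; the function name, the warmup setting, and the per-batch scheduler.step() call are my additions rather than part of the original code.

from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup

def cls_train_with_schedule(train_loader, model, optimizer, scheduler, device):
    """Same loop as cls_train, but steps a learning-rate scheduler after each batch."""
    model.train()
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(batch['input_ids'].to(device),
                        attention_mask=batch['attention_mask'].to(device),
                        token_type_ids=batch['token_type_ids'].to(device),
                        labels=batch['labels'].to(device))
        outputs.loss.backward()
        optimizer.step()
        scheduler.step()

# The scheduler would be built once before training, e.g.:
# total_steps = len(train_loader) * EPOCHS
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
#                                             num_training_steps=total_steps)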

# Model evaluation
from sklearn import metrics

def cls_test_macro(test_dataloader, model, device):
    """

    :param test_dataloader:
    :param model:
    :param device:
    :return:
    """

    model.eval()
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for idx, batch in enumerate(test_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask, token_type_ids)
            fin_outputs.extend(outputs.logits.cpu().detach().numpy().argmax(1).tolist())
            fin_targets.extend(labels.cpu().detach().numpy().tolist())

    f1_macro = metrics.f1_score(fin_targets, fin_outputs, average='macro')
    return f1_macro
import json
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, set_seed, AdamW, BertForSequenceClassification
from torch.utils.data import DataLoader
import torch.nn as nn

# Set the random seed for reproducibility
set_seed(42)

# Data paths
train_path = "./data/train_zl.json"
model_path = "hfl/chinese-roberta-wwm-ext-large"
# Path for saving the best checkpoint
SAVE_PATH = "./save_model/best.pth"

# Build the tokenizer
tokenizer = BertTokenizer.from_pretrained(model_path)

# Hyperparameters
MAX_LEN = 512
train_batch_size = 4
dev_batch_size = 4
LEARNING_RATE = 2e-5
label_num = 36
device = "cuda" if torch.cuda.is_available() else "cpu"
EPOCHS = 20

# Build the dataset: one JSON object per line with title / assignee / abstract / label_id
data_list = []
label_list = []

with open(train_path, encoding="utf-8") as file_read:
    for line in file_read:
        line_dict = json.loads(line)
        data_list.append(
            f"这份专利的标题为:《{line_dict['title']}》,由“{line_dict['assignee']}”公司申请,详细说明如下:{line_dict['abstract']}")
        label_list.append(int(line_dict['label_id']))

# Train/validation split
MAX_ACC = 0  # tracks the best validation macro F1 seen so far
X_train, X_test, y_train, y_test = train_test_split(data_list, label_list)
train_list = X_train
train_label_ = y_train

dev_text = X_test
dev_label = y_test


model = BertForSequenceClassification.from_pretrained(model_path, num_labels=label_num)
model.to(device)

train_dataset = CLSDataset(train_list, train_label_, tokenizer, MAX_LEN)
dev_dataset = CLSDataset(dev_text, dev_label, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
test_loader = DataLoader(dev_dataset, batch_size=dev_batch_size, shuffle=False)

# Loss function (not strictly needed: the model computes cross-entropy internally when labels are passed)
loss = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Training loop: keep the checkpoint with the best validation macro F1
for epoch in range(EPOCHS):
    cls_train(train_loader, model, loss, optimizer, device)
    acc = cls_test_macro(test_loader, model, device)
    if acc > MAX_ACC:
        MAX_ACC = acc
        print(f"epoch {epoch}: new best macro F1 = {MAX_ACC}")
        torch.save(model.state_dict(), SAVE_PATH)
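
After training, the saved weights can be reloaded to label new samples. The sketch below is a hypothetical inference step: the test file path and the per-line prediction list are assumptions, so adapt them to the actual competition submission format.

# Hypothetical inference script: load the best checkpoint and predict labels for an
# unlabeled test file that has the same title / assignee / abstract fields.
test_path = "./data/test_zl.json"   # assumed path, not part of the original post

model = BertForSequenceClassification.from_pretrained(model_path, num_labels=label_num)
model.load_state_dict(torch.load(SAVE_PATH, map_location=device))
model.to(device)
model.eval()

predictions = []
with open(test_path, encoding="utf-8") as f, torch.no_grad():
    for line in f:
        d = json.loads(line)
        text = (f"这份专利的标题为:《{d['title']}》,"
                f"由“{d['assignee']}”公司申请,详细说明如下:{d['abstract']}")
        enc = tokenizer(text, max_length=MAX_LEN, padding='max_length',
                        truncation=True, return_tensors='pt').to(device)
        logits = model(**enc).logits
        predictions.append(int(logits.argmax(dim=-1).item()))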