BERT Two-Tower (Dual-Encoder) Model

Published 2023-11-16 15:03:48 · Author: 15375357604
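The script below trains a two-tower (dual-encoder) relevance model: a shared BERT encoder embeds the query and the title separately, each tower is projected to a 128-dimensional vector, the cosine similarity between the two vectors is passed through a sigmoid, and the model is trained with binary cross-entropy. The two towers are then exported as standalone models for serving.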
import tensorflow as tf
from keras import Input, Model
import keras.layers as layers
from keras.layers import Dot
from transformers import TFAutoModel, AutoTokenizer
import pandas as pd

# Tab-separated file with columns query, title, label (binary relevance).
df = pd.read_csv("data.txt", sep="\t")
# print(df.columns)
# df['label'][df['label']==0]=-1
# print(df.label.value_counts())

target_values = df["label"]

# Local path to the pretrained BERT checkpoint; the same encoder is shared by both towers.
model_name = './bert_pretrained/bert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
encoder = TFAutoModel.from_pretrained(model_name)


MAX_LEN = 50

def tokenize(x):
    # Encode one string into fixed-length input_ids and attention_mask tensors.
    output = tokenizer(x,
                       add_special_tokens=True,
                       max_length=MAX_LEN,
                       padding="max_length",
                       truncation=True,
                       return_tensors="tf")
    return (tf.cast(output["input_ids"], tf.int32),
            tf.cast(output["attention_mask"], tf.int32))

def batchTokenize(data):
    # Tokenize a list of strings one by one and stack into [N, MAX_LEN] tensors.
    toks = []
    atts = []
    for i in data:
        x, y = tokenize(i)
        toks.append(tf.reshape(x, [1, MAX_LEN]))
        atts.append(tf.reshape(y, [1, MAX_LEN]))
    toks = tf.concat(toks, 0)
    atts = tf.concat(atts, 0)
    return (toks, atts)
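
As a side note, looping one string at a time is slow on large datasets: the Hugging Face tokenizer also accepts a list of strings and pads the whole batch in a single call. A minimal alternative sketch, reusing the tokenizer and MAX_LEN above (the helper name batch_tokenize_fast is mine, not from the original post):

def batch_tokenize_fast(texts):
    # One tokenizer call over the whole list; returns [N, MAX_LEN] batched tensors.
    enc = tokenizer(texts,
                    add_special_tokens=True,
                    max_length=MAX_LEN,
                    padding="max_length",
                    truncation=True,
                    return_tensors="tf")
    return enc["input_ids"], enc["attention_mask"]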

# Encode both sides of every (query, title) pair.
toks1_input, atts1_input = batchTokenize(df["query"].tolist())
toks2_input, atts2_input = batchTokenize(df["title"].tolist())

# Two input branches; both towers share the same BERT encoder (tied weights).
toks1 = Input(shape=(MAX_LEN,), dtype="int32")
atts1 = Input(shape=(MAX_LEN,), dtype="int32")
out1 = encoder({'input_ids': toks1, 'attention_mask': atts1})

toks2 = Input(shape=(MAX_LEN,), dtype="int32")
atts2 = Input(shape=(MAX_LEN,), dtype="int32")
out2 = encoder({'input_ids': toks2, 'attention_mask': atts2})

# Mean-pool the token embeddings (out[0] is the last hidden state), then project to 128-d.
mean1 = tf.reduce_mean(out1[0], axis=1)
query_out = layers.Dense(128, activation='relu')(mean1)

mean2 = tf.reduce_mean(out2[0], axis=1)
item_out = layers.Dense(128, activation='relu')(mean2)

# Dot with normalize=True computes the cosine similarity of the two tower vectors.
cosine_similarity = Dot(axes=1, normalize=True)
preds = cosine_similarity([query_out, item_out])

# Map the cosine score to a (0, 1) probability for binary cross-entropy.
output = tf.keras.layers.Dense(1, activation='sigmoid')(preds)


model = Model(inputs=[toks1, atts1, toks2, atts2], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
# model.compile(loss='mse', optimizer='nadam', metrics=['mse'])

model.fit([toks1_input, atts1_input, toks2_input, atts2_input], target_values,
          epochs=2, batch_size=64, shuffle=True, validation_split=0.1)
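
One caveat worth flagging: the tf.reduce_mean pooling above averages over all MAX_LEN positions, padding included, so short texts get diluted by [PAD] embeddings. A hedged sketch of mask-weighted mean pooling that could replace those reduce_mean calls (masked_mean_pool is my name, not from the original post; it assumes the attention mask marks real tokens with 1):

def masked_mean_pool(hidden_states, attention_mask):
    # hidden_states: [batch, MAX_LEN, hidden]; attention_mask: [batch, MAX_LEN].
    mask = tf.cast(attention_mask, tf.float32)[:, :, tf.newaxis]
    summed = tf.reduce_sum(hidden_states * mask, axis=1)     # sum over real tokens only
    counts = tf.maximum(tf.reduce_sum(mask, axis=1), 1e-9)   # guard against empty masks
    return summed / counts

# e.g. mean1 = masked_mean_pool(out1[0], atts1)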



# Export the query tower as a standalone sub-model.
query_model = Model(inputs=[toks1, atts1], outputs=query_out)
query_model_path = "./models/query"
query_model.save(query_model_path, save_format="tf")

# Export the item tower as a standalone sub-model.
item_model = Model(inputs=[toks2, atts2], outputs=item_out)
item_model_path = "./models/item"
item_model.save(item_model_path, save_format="tf")
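
At serving time the two towers are used independently: queries are embedded online, items are typically embedded offline into an ANN index, and a pair is scored by cosine similarity. A minimal sketch of scoring one pair, assuming the save paths and the tokenize helper above (the input strings are hypothetical; loading a SavedModel that wraps a transformers layer may require the transformers package to be importable at load time):

loaded_query = tf.keras.models.load_model("./models/query")
loaded_item = tf.keras.models.load_model("./models/item")

q_ids, q_mask = tokenize("example query")   # hypothetical inputs
t_ids, t_mask = tokenize("example title")

q_vec = loaded_query.predict([q_ids, q_mask])   # shape [1, 128]
t_vec = loaded_item.predict([t_ids, t_mask])    # shape [1, 128]

# Cosine similarity between the two 128-d embeddings.
q_norm = tf.math.l2_normalize(q_vec, axis=1)
t_norm = tf.math.l2_normalize(t_vec, axis=1)
print(float(tf.reduce_sum(q_norm * t_norm, axis=1)[0]))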