import torch
from torch import nn
from d2l import torch as d2l

batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)


# Next, initialize the model parameters. Weights are drawn from a Gaussian
# distribution with standard deviation 0.01 and biases are set to 0. The
# hyperparameter num_hidden defines the number of hidden units. We instantiate
# all weights and biases for the update gate, the reset gate, the candidate
# hidden state and the output layer.
def get_params(vocab_size, num_hidden, device, init_std=0.01):
    """Create the trainable GRU parameters.

    Args:
        vocab_size: Used as both the input and the output feature size.
        num_hidden: Number of hidden units.
        device: Device on which every parameter tensor is allocated.
        init_std: Standard deviation of the zero-mean Gaussian used for the
            weight matrices. Biases are always initialized to zero.

    Returns:
        A list of 11 tensors with ``requires_grad`` enabled, in the order
        ``[w_xz, w_hz, b_z, w_xr, w_hr, b_r, w_xh, w_hh, b_h, w_hq, b_q]``.
    """
    num_inputs = num_outputs = vocab_size

    def normal(shape):
        # torch.randn samples from N(0, 1); without scaling, the gate
        # pre-activations are large enough to saturate sigmoid/tanh.
        return torch.randn(size=shape, device=device) * init_std

    def three():
        # One (input-to-hidden, hidden-to-hidden, bias) triple.
        return (normal((num_inputs, num_hidden)),
                normal((num_hidden, num_hidden)),
                torch.zeros(num_hidden, device=device))

    w_xz, w_hz, b_z = three()  # update gate parameters
    w_xr, w_hr, b_r = three()  # reset gate parameters
    w_xh, w_hh, b_h = three()  # candidate hidden state parameters
    # Output layer parameters
    w_hq = normal((num_hidden, num_outputs))
    b_q = torch.zeros(num_outputs, device=device)
    # Attach gradients
    params = [w_xz, w_hz, b_z, w_xr, w_hr, b_r, w_xh, w_hh, b_h, w_hq, b_q]
    for param in params:
        param.requires_grad_(True)
    return params


def init_gru_state(batch_size, num_hidden, device):
    """Return the initial state: a 1-tuple holding a zero tensor of shape
    (batch_size, num_hidden) on ``device``."""
    return (torch.zeros((batch_size, num_hidden), device=device),)


def gru(inputs, state, params):
    """Run the GRU over ``inputs`` one step at a time.

    Args:
        inputs: Iterated along its first axis; each item ``x`` is multiplied
            by the (vocab_size, num_hidden) input weights, so its last
            dimension must equal vocab_size. Presumably shaped
            (num_steps, batch_size, vocab_size) -- confirm against the caller.
        state: 1-tuple ``(h,)`` holding the current hidden state.
        params: The 11 tensors produced by ``get_params``, in the same order.

    Returns:
        ``(outputs, (h,))`` where ``outputs`` is the per-step output-layer
        results concatenated along dim 0 and ``h`` is the final hidden state.
    """
    w_xz, w_hz, b_z, w_xr, w_hr, b_r, w_xh, w_hh, b_h, w_hq, b_q = params
    h, = state
    outputs = []
    for x in inputs:
        z = torch.sigmoid((x @ w_xz) + (h @ w_hz) + b_z)  # @ is matrix multiplication
        r = torch.sigmoid((x @ w_xr) + (h @ w_hr) + b_r)
        # The reset gate scales the previous state before it feeds the candidate.
        h_tilda = torch.tanh((x @ w_xh) + (r * h) @ w_hh + b_h)
        # The update gate interpolates between the old state and the candidate.
        h = z * h + (1 - z) * h_tilda
        y = h @ w_hq + b_q
        outputs.append(y)
    return torch.cat(outputs, dim=0), (h,)


vocab_size, num_hidden, device = len(vocab), 256, d2l.try_gpu()
num_epochs, lr = 500, 1
model = d2l.RNNModelScratch(vocab_size, num_hidden, device, get_params,
                            init_gru_state, gru)
d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)