{"id":177202,"date":"2025-12-01T13:28:32","date_gmt":"2025-12-01T13:28:32","guid":{"rendered":"https:\/\/ktromedia.com\/?p=177202"},"modified":"2025-12-01T13:28:32","modified_gmt":"2025-12-01T13:28:32","slug":"fine-tuning-a-bert-model-machinelearningmastery-com","status":"publish","type":"post","link":"http:\/\/ktromedia.com\/?p=177202","title":{"rendered":"Fine-Tuning a BERT Model &#8211; MachineLearningMastery.com"},"content":{"rendered":"<div style=\"font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;\">\n<p><span class=\"crayon-e\">import <\/span><span class=\"crayon-e\">collections<\/span><\/p>\n<p><span class=\"crayon-e\">import <\/span><span class=\"crayon-e\">dataclasses<\/span><\/p>\n<p><span class=\"crayon-e\">import <\/span><span class=\"crayon-e\">functools<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-e\">import <\/span><span class=\"crayon-e\">torch<\/span><\/p>\n<p><span class=\"crayon-e\">import <\/span><span class=\"crayon-v\">torch<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">nn <\/span><span class=\"crayon-st\">as<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">nn<\/span><\/p>\n<p><span class=\"crayon-e\">import <\/span><span class=\"crayon-v\">torch<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">optim <\/span><span class=\"crayon-st\">as<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">optim<\/span><\/p>\n<p><span class=\"crayon-e\">import <\/span><span class=\"crayon-e\">tqdm<\/span><\/p>\n<p><span class=\"crayon-e\">from <\/span><span class=\"crayon-e\">datasets <\/span><span class=\"crayon-e\">import <\/span><span class=\"crayon-e\">load_dataset<\/span><\/p>\n<p><span class=\"crayon-e\">from <\/span><span class=\"crayon-e\">tokenizers <\/span><span class=\"crayon-e\">import <\/span><span class=\"crayon-e\">Tokenizer<\/span><\/p>\n<p><span class=\"crayon-e\">from <\/span><span class=\"crayon-e\">torch <\/span><span class=\"crayon-e\">import <\/span><span class=\"crayon-i\">Tensor<\/span><\/p>\n<p>\u00a0<\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-p\"># BERT config and model defined previously<\/span><\/p>\n<p><span class=\"crayon-sy\">@<\/span><span class=\"crayon-v\">dataclasses<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">dataclass<\/span><\/p>\n<p><span class=\"crayon-t\">class<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">BertConfig<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-s\">&#8220;&#8221;<\/span><span class=\"crayon-s\">&#8220;Configuration for BERT model.&#8221;<\/span><span class=\"crayon-s\">&#8220;&#8221;<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">vocab_size<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-t\">int<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">30522<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">num_layers<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-t\">int<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">12<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-t\">int<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">768<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">num_heads<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-t\">int<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">12<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">dropout_prob<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-t\">float<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">0.1<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">pad_id<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-t\">int<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">0<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">max_seq_len<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-t\">int<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">512<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">num_types<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-t\">int<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">2<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-t\">class<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">BertBlock<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">Module<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-s\">&#8220;&#8221;<\/span><span class=\"crayon-s\">&#8220;One transformer block in BERT.&#8221;<\/span><span class=\"crayon-s\">&#8220;&#8221;<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-e\">def <\/span><span class=\"crayon-e\">__init__<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-t\">int<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">num_heads<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-t\">int<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">dropout_prob<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-t\">float<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">super<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">__init__<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">attention<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">MultiheadAttention<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">num_heads<\/span><span class=\"crayon-sy\">,<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 <\/span><span class=\"crayon-v\">dropout<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-v\">dropout_prob<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">batch_first<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-t\">True<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">attn_norm<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">LayerNorm<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">ff_norm<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">LayerNorm<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">dropout<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">Dropout<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">dropout_prob<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">feed_forward<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">Sequential<\/span><span class=\"crayon-sy\">(<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">Linear<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">4<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">*<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">,<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">GELU<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">,<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">Linear<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-cn\">4<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">*<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">,<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-e\">def <\/span><span class=\"crayon-e\">forward<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">pad_mask<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">-&gt;<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># self-attention with padding mask and post-norm<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">attn_output<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">_<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">attention<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">key_padding_mask<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-v\">pad_mask<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">attn_norm<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">+<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">attn_output<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># feed-forward with GeLU activation and post-norm<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">ff_output<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">feed_forward<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">ff_norm<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">+<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">dropout<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">ff_output<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">return<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-i\">x<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-t\">class<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">BertPooler<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">Module<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-s\">&#8220;&#8221;<\/span><span class=\"crayon-s\">&#8220;Pooler layer for BERT to process the [CLS] token output.&#8221;<\/span><span class=\"crayon-s\">&#8220;&#8221;<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-e\">def <\/span><span class=\"crayon-e\">__init__<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-t\">int<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">super<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">__init__<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">dense<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">Linear<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">activation<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">Tanh<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-e\">def <\/span><span class=\"crayon-e\">forward<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">-&gt;<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">dense<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">activation<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">return<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-i\">x<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-t\">class<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">BertModel<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">Module<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-s\">&#8220;&#8221;<\/span><span class=\"crayon-s\">&#8220;Backbone of BERT model.&#8221;<\/span><span class=\"crayon-s\">&#8220;&#8221;<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-e\">def <\/span><span class=\"crayon-e\">__init__<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">BertConfig<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">super<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">__init__<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># embedding layers<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">word_embeddings<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">Embedding<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">vocab_size<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-sy\">,<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">padding_idx<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">pad_id<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">type_embeddings<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">Embedding<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">num_types<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">position_embeddings<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">Embedding<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">max_seq_len<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">embeddings_norm<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">LayerNorm<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">embeddings_dropout<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">Dropout<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">dropout_prob<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># transformer blocks<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">blocks<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">ModuleList<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">[<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-e\">BertBlock<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">num_heads<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">dropout_prob<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">for<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-i\">_<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-st\">in<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">range<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">num_layers<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># [CLS] pooler layer<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">pooler<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">BertPooler<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-e\">def <\/span><span class=\"crayon-e\">forward<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">pad_id<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-t\">int<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">0<\/span><span class=\"crayon-sy\">,<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">-&gt;<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">tuple<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># create attention mask for padding tokens<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">pad_mask<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">==<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">pad<\/span><span class=\"crayon-sy\">_<\/span>id<\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># convert integer tokens to embedding vectors<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">batch_size<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">seq_len<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">shape<\/span><\/p>\n<p><span class=\"crayon-e\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">position_ids<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">torch<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">arange<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">seq_len<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">device<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">device<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">unsqueeze<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-cn\">0<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">position_embeddings<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">position_embeddings<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">position_ids<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">type_embeddings<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">type_embeddings<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">token_embeddings<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">word_embeddings<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">token_embeddings<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">+<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">type_embeddings<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">+<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">position<\/span><span class=\"crayon-sy\">_<\/span>embeddings<\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">embeddings_norm<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">embeddings_dropout<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># process the sequence with transformer blocks<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">for<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">block <\/span><span class=\"crayon-st\">in<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">blocks<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">block<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">pad_mask<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># pool the hidden state of the `[CLS]` token<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">pooled_output<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">pooler<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">0<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">return<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">x<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">pooled<\/span><span class=\"crayon-sy\">_<\/span>output<\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-p\"># Define new BERT model for question answering<\/span><\/p>\n<p><span class=\"crayon-t\">class<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">BertForQuestionAnswering<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">Module<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-s\">&#8220;&#8221;<\/span><span class=\"crayon-s\">&#8220;BERT model for SQuAD question answering.&#8221;<\/span><span class=\"crayon-s\">&#8220;&#8221;<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-e\">def <\/span><span class=\"crayon-e\">__init__<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">BertConfig<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">super<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">__init__<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">bert<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">BertModel<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># Two outputs: start and end position logits<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">qa_outputs<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">Linear<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">hidden_size<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">2<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-e\">def <\/span><span class=\"crayon-e\">forward<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">,<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-sy\">,<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-sy\">,<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">pad_id<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-t\">int<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">0<\/span><span class=\"crayon-sy\">,<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">-&gt;<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">tuple<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># Get sequence output from BERT (batch_size, seq_len, hidden_size)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">seq_output<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">pooled_output<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">bert<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">pad_id<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-v\">pad_id<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># Project to start and end logits<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">logits<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-r\">self<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">qa_outputs<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">seq_output<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-h\">\u00a0\u00a0<\/span><span class=\"crayon-p\"># (batch_size, seq_len, 2)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">start_logits<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">logits<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">0<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-h\">\u00a0\u00a0<\/span><span class=\"crayon-p\"># (batch_size, seq_len)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">end_logits<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">logits<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">1<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># (batch_size, seq_len)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">return<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">start_logits<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">end<\/span><span class=\"crayon-sy\">_<\/span>logits<\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-p\"># Load SQuAD dataset for question answering<\/span><\/p>\n<p><span class=\"crayon-v\">dataset<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">load_dataset<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-s\">&#8220;squad&#8221;<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-p\"># Load the pretrained BERT tokenizer<\/span><\/p>\n<p><span class=\"crayon-v\">TOKENIZER_PATH<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-s\">&#8220;wikitext-2_wordpiece.json&#8221;<\/span><\/p>\n<p><span class=\"crayon-v\">tokenizer<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">Tokenizer<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">from_file<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">TOKENIZER_PATH<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-p\"># Setup collate function to tokenize question-context pairs for the model<\/span><\/p>\n<p><span class=\"crayon-e\">def <\/span><span class=\"crayon-e\">collate<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">batch<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">list<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-v\">dict<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">tokenizer<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">Tokenizer<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">max_len<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-t\">int<\/span><span class=\"crayon-sy\">,<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">-&gt;<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">tuple<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">Tensor<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-s\">&#8220;&#8221;<\/span><span class=\"crayon-s\">&#8220;Collate question-context pairs for the model.&#8221;<\/span><span class=\"crayon-s\">&#8220;&#8221;<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">cls_id<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">tokenizer<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">token_to_id<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-s\">&#8220;[CLS]&#8221;<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">sep_id<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">tokenizer<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">token_to_id<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-s\">&#8220;[SEP]&#8221;<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">pad_id<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">tokenizer<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">token_to_id<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-s\">&#8220;[PAD]&#8221;<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">input_ids_list<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-sy\">]<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">token_type_ids_list<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-sy\">]<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">start_positions<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-sy\">]<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">end_positions<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-sy\">]<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">for<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">item <\/span><span class=\"crayon-st\">in<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">batch<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># Tokenize question and context<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">question<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">context<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">item<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-s\">&#8220;question&#8221;<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">item<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-s\">&#8220;context&#8221;<\/span><span class=\"crayon-sy\">]<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">question_ids<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">tokenizer<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">encode<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">question<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">ids<\/span><\/p>\n<p><span class=\"crayon-e\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">context_ids<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">tokenizer<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">encode<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">context<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-i\">ids<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># Build input: [CLS] question [SEP] context [SEP]<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-v\">cls_id<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">*<\/span><span class=\"crayon-v\">question_ids<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">sep_id<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">*<\/span><span class=\"crayon-v\">context_ids<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">sep_id<\/span><span class=\"crayon-sy\">]<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-cn\">0<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">*<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-e\">len<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">question_ids<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-o\">+<\/span><span class=\"crayon-cn\">2<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">+<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-cn\">1<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">*<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-e\">len<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">context_ids<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-o\">+<\/span><span class=\"crayon-cn\">1<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># Truncate or pad to max length<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">if<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">len<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">&gt;<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">max_len<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-v\">max_len<\/span><span class=\"crayon-sy\">]<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-v\">max_len<\/span><span class=\"crayon-sy\">]<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">else<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">extend<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-v\">pad_id<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">*<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">max_len<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">&#8211;<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">len<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">extend<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-cn\">1<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">*<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">max_len<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">&#8211;<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">len<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># Find answer position in tokens: Answer may not be in the context<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">start_pos<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">end_pos<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">0<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">if<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">len<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">item<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-s\">&#8220;answers&#8221;<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-s\">&#8220;text&#8221;<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">&gt;<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">0<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">answers<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">tokenizer<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">encode<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">item<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-s\">&#8220;answers&#8221;<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-s\">&#8220;text&#8221;<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-cn\">0<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-i\">ids<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># find the context offset of the answer in context_ids<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">for<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-i\">i<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-st\">in<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">range<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-e\">len<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">context_ids<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">&#8211;<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">len<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">answers<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">+<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">1<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">if<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">context_ids<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-v\">i<\/span><span class=\"crayon-o\">:<\/span><span class=\"crayon-v\">i<\/span><span class=\"crayon-o\">+<\/span><span class=\"crayon-e\">len<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">answers<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">==<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">answers<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">start_pos<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">i<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">+<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">len<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">question_ids<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">+<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">2<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">end_pos<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">start_pos<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">+<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">len<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">answers<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">&#8211;<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">1<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">break<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">if<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">end_pos<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">&gt;=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">max_len<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">start_pos<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">end_pos<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">0<\/span><span class=\"crayon-h\">\u00a0\u00a0<\/span><span class=\"crayon-p\"># answer is clipped, hence no answer<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">input_ids_list<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">append<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">token_type_ids_list<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">append<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">start_positions<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">append<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">start_pos<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">end_positions<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">append<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">end_pos<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">input_ids_list<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">torch<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">tensor<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">input_ids_list<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">token_type_ids_list<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">torch<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">tensor<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">token_type_ids_list<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">start_positions<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">torch<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">tensor<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">start_positions<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">end_positions<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">torch<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">tensor<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">end_positions<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">return<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">input_ids_list<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">token_type_ids_list<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">start_positions<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">end_positions<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-v\">batch_size<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">16<\/span><\/p>\n<p><span class=\"crayon-v\">max_len<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">384<\/span><span class=\"crayon-h\">\u00a0\u00a0<\/span><span class=\"crayon-p\"># Longer for Q&amp;A to accommodate context<\/span><\/p>\n<p><span class=\"crayon-v\">collate_fn<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">functools<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">partial<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">collate<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">tokenizer<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-v\">tokenizer<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">max_len<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-v\">max_len<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-v\">train_loader<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">torch<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">utils<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">data<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">DataLoader<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">dataset<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-s\">&#8220;train&#8221;<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">batch_size<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-v\">batch_size<\/span><span class=\"crayon-sy\">,<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 <\/span><span class=\"crayon-v\">shuffle<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-t\">True<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">collate_fn<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-v\">collate_fn<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-v\">val_loader<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">torch<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">utils<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">data<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">DataLoader<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">dataset<\/span><span class=\"crayon-sy\">[<\/span><span class=\"crayon-s\">&#8220;validation&#8221;<\/span><span class=\"crayon-sy\">]<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">batch_size<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-v\">batch_size<\/span><span class=\"crayon-sy\">,<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 <\/span><span class=\"crayon-v\">shuffle<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-t\">False<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">collate_fn<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-v\">collate_fn<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-p\"># Create Q&amp;A model with a pretrained foundation BERT model<\/span><\/p>\n<p><span class=\"crayon-v\">device<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">torch<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">device<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-s\">&#8220;cuda&#8221;<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-st\">if<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">torch<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">cuda<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">is_available<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-st\">else<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-s\">&#8220;cpu&#8221;<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-v\">config<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">BertConfig<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-v\">model<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">BertForQuestionAnswering<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">config<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-v\">model<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-st\">to<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">device<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-v\">model<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-v\">bert<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">load_state_dict<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">torch<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">load<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-s\">&#8220;bert_model.pth&#8221;<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">map_location<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-v\">device<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-p\"># Training setup<\/span><\/p>\n<p><span class=\"crayon-v\">loss_fn<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">nn<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">CrossEntropyLoss<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-v\">optimizer<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">optim<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">AdamW<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">model<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">parameters<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">lr<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-cn\">2e<\/span><span class=\"crayon-o\">&#8211;<\/span><span class=\"crayon-cn\">5<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-v\">num_epochs<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">3<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-st\">for<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">epoch <\/span><span class=\"crayon-st\">in<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">range<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">num_epochs<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">model<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">train<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># Training<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-e\">with <\/span><span class=\"crayon-v\">tqdm<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">tqdm<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">train_loader<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">desc<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-i\">f<\/span><span class=\"crayon-s\">&#8220;Epoch {epoch+1}\/{num_epochs}&#8221;<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-st\">as<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">pbar<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">for<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">batch <\/span><span class=\"crayon-st\">in<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">pbar<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># get batched data<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">start_positions<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">end_positions<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">batch<\/span><\/p>\n<p><span class=\"crayon-e\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-st\">to<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">device<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-st\">to<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">device<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">start_positions<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">start_positions<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-st\">to<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">device<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">end_positions<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">end_positions<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-st\">to<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">device<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># forward pass<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">start_logits<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">end_logits<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">model<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># backward pass<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">optimizer<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">zero_grad<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">start_loss<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">loss_fn<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">start_logits<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">start_positions<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">end_loss<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">loss_fn<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">end_logits<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">end_positions<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">loss<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">start_loss<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">+<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">end_loss<\/span><\/p>\n<p><span class=\"crayon-e\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">loss<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">backward<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">optimizer<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">step<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># update progress bar<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">pbar<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">set_postfix<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">loss<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-t\">float<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">loss<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">pbar<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">update<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-cn\">1<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># Validation: Keep track of the average loss and accuracy<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">model<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">eval<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">val_loss<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">num_matches<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">num_batches<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">num_samples<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">0<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">0<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">0<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">0<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-e\">with <\/span><span class=\"crayon-v\">torch<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">no_grad<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-st\">for<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">batch <\/span><span class=\"crayon-st\">in<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">val_loader<\/span><span class=\"crayon-o\">:<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># get batched data<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">start_positions<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">end_positions<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">batch<\/span><\/p>\n<p><span class=\"crayon-e\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-st\">to<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">device<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-st\">to<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">device<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">start_positions<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">start_positions<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-st\">to<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">device<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">end_positions<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">end_positions<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-st\">to<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">device<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># forward pass on validation data<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">start_logits<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">end_logits<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">model<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">input_ids<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">token_type_ids<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># compute loss<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">start_loss<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">loss_fn<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">start_logits<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">start_positions<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">end_loss<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">loss_fn<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">end_logits<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">end_positions<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">loss<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">start_loss<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">+<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">end_loss<\/span><\/p>\n<p><span class=\"crayon-e\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">val_loss<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">+=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">loss<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">item<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">num_batches<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">+=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-cn\">1<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-p\"># compute accuracy<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">pred_start<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">start_logits<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">argmax<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">dim<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-o\">&#8211;<\/span><span class=\"crayon-cn\">1<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">pred_end<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">end_logits<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">argmax<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">dim<\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-o\">&#8211;<\/span><span class=\"crayon-cn\">1<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">match<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">pred_start<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">==<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">start_positions<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">&amp;<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">pred_end<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">==<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">end_positions<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">num_matches<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">+=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">match<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">sum<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">item<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">num_samples<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">+=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">len<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">start_positions<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-h\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">avg_loss<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">val_loss<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">\/<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">num_batches<\/span><\/p>\n<p><span class=\"crayon-e\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-v\">acc<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">=<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-v\">num_matches<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-o\">\/<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-e\">num_samples<\/span><\/p>\n<p><span class=\"crayon-e\">\u00a0\u00a0\u00a0\u00a0<\/span><span class=\"crayon-e\">print<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-i\">f<\/span><span class=\"crayon-s\">&#8220;Validation {epoch+1}\/{num_epochs}: acc {acc:.4f}, avg loss {avg_loss:.4f}&#8221;<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<p>\u00a0<\/p>\n<p><span class=\"crayon-p\"># Save the fine-tuned model<\/span><\/p>\n<p><span class=\"crayon-v\">torch<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">save<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-v\">model<\/span><span class=\"crayon-sy\">.<\/span><span class=\"crayon-e\">state_dict<\/span><span class=\"crayon-sy\">(<\/span><span class=\"crayon-sy\">)<\/span><span class=\"crayon-sy\">,<\/span><span class=\"crayon-h\"> <\/span><span class=\"crayon-i\">f<\/span><span class=\"crayon-s\">&#8220;bert_model_squad.pth&#8221;<\/span><span class=\"crayon-sy\">)<\/span><\/p>\n<\/div>\n","protected":false},"excerpt":{"rendered":"<p>import collections import dataclasses import functools \u00a0 import torch import torch.nn as nn import torch.optim as optim import tqdm from datasets import load_dataset from tokenizers import Tokenizer from torch import Tensor \u00a0 \u00a0 # BERT config and model defined previously @dataclasses.dataclass class BertConfig: \u00a0\u00a0\u00a0\u00a0&#8220;&#8221;&#8220;Configuration for BERT model.&#8221;&#8220;&#8221; \u00a0\u00a0\u00a0\u00a0vocab_size: int = 30522 \u00a0\u00a0\u00a0\u00a0num_layers: int = 12<\/p>\n","protected":false},"author":1,"featured_media":177203,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[42],"tags":[],"class_list":{"0":"post-177202","1":"post","2":"type-post","3":"status-publish","4":"format-standard","5":"has-post-thumbnail","7":"category-ai"},"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v26.4 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>Fine-Tuning a BERT Model - MachineLearningMastery.com - Ktromedia<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"http:\/\/ktromedia.com\/?p=177202\" \/>\n<meta property=\"og:locale\" content=\"en_US\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"Fine-Tuning a BERT Model - MachineLearningMastery.com - Ktromedia\" \/>\n<meta property=\"og:description\" content=\"import collections import dataclasses import functools \u00a0 import torch import torch.nn as nn import torch.optim as optim import tqdm from datasets import load_dataset from tokenizers import Tokenizer from torch import Tensor \u00a0 \u00a0 # BERT config and model defined previously @dataclasses.dataclass class BertConfig: \u00a0\u00a0\u00a0\u00a0&#8220;&#8221;&#8220;Configuration for BERT model.&#8221;&#8220;&#8221; \u00a0\u00a0\u00a0\u00a0vocab_size: int = 30522 \u00a0\u00a0\u00a0\u00a0num_layers: int = 12\" \/>\n<meta property=\"og:url\" content=\"http:\/\/ktromedia.com\/?p=177202\" \/>\n<meta property=\"og:site_name\" content=\"Ktromedia\" \/>\n<meta property=\"article:publisher\" content=\"https:\/\/www.facebook.com\/KTROMedia\/\" \/>\n<meta property=\"article:published_time\" content=\"2025-12-01T13:28:32+00:00\" \/>\n<meta property=\"og:image\" content=\"http:\/\/ktromedia.com\/wp-content\/uploads\/2025\/12\/Fine-Tuning-a-BERT-Model-MachineLearningMasterycom.jpg\" \/>\n\t<meta property=\"og:image:width\" content=\"2560\" \/>\n\t<meta property=\"og:image:height\" content=\"1707\" \/>\n\t<meta property=\"og:image:type\" content=\"image\/jpeg\" \/>\n<meta name=\"author\" content=\"KTRO TEAM\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"Written by\" \/>\n\t<meta name=\"twitter:data1\" content=\"KTRO TEAM\" \/>\n\t<meta name=\"twitter:label2\" content=\"Est. reading time\" \/>\n\t<meta name=\"twitter:data2\" content=\"7 minutes\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"Article\",\"@id\":\"http:\/\/ktromedia.com\/?p=177202#article\",\"isPartOf\":{\"@id\":\"http:\/\/ktromedia.com\/?p=177202\"},\"author\":{\"name\":\"KTRO TEAM\",\"@id\":\"http:\/\/ktromedia.com\/#\/schema\/person\/612bf2fbac107722ea365932cdd35f5b\"},\"headline\":\"Fine-Tuning a BERT Model &#8211; MachineLearningMastery.com\",\"datePublished\":\"2025-12-01T13:28:32+00:00\",\"mainEntityOfPage\":{\"@id\":\"http:\/\/ktromedia.com\/?p=177202\"},\"wordCount\":1387,\"commentCount\":0,\"publisher\":{\"@id\":\"http:\/\/ktromedia.com\/#organization\"},\"image\":{\"@id\":\"http:\/\/ktromedia.com\/?p=177202#primaryimage\"},\"thumbnailUrl\":\"http:\/\/ktromedia.com\/wp-content\/uploads\/2025\/12\/Fine-Tuning-a-BERT-Model-MachineLearningMasterycom.jpg\",\"articleSection\":[\"\u4eba\u5de5\u667a\u80fd\"],\"inLanguage\":\"en-US\",\"potentialAction\":[{\"@type\":\"CommentAction\",\"name\":\"Comment\",\"target\":[\"http:\/\/ktromedia.com\/?p=177202#respond\"]}]},{\"@type\":\"WebPage\",\"@id\":\"http:\/\/ktromedia.com\/?p=177202\",\"url\":\"http:\/\/ktromedia.com\/?p=177202\",\"name\":\"Fine-Tuning a BERT Model - MachineLearningMastery.com - Ktromedia\",\"isPartOf\":{\"@id\":\"http:\/\/ktromedia.com\/#website\"},\"primaryImageOfPage\":{\"@id\":\"http:\/\/ktromedia.com\/?p=177202#primaryimage\"},\"image\":{\"@id\":\"http:\/\/ktromedia.com\/?p=177202#primaryimage\"},\"thumbnailUrl\":\"http:\/\/ktromedia.com\/wp-content\/uploads\/2025\/12\/Fine-Tuning-a-BERT-Model-MachineLearningMasterycom.jpg\",\"datePublished\":\"2025-12-01T13:28:32+00:00\",\"breadcrumb\":{\"@id\":\"http:\/\/ktromedia.com\/?p=177202#breadcrumb\"},\"inLanguage\":\"en-US\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"http:\/\/ktromedia.com\/?p=177202\"]}]},{\"@type\":\"ImageObject\",\"inLanguage\":\"en-US\",\"@id\":\"http:\/\/ktromedia.com\/?p=177202#primaryimage\",\"url\":\"http:\/\/ktromedia.com\/wp-content\/uploads\/2025\/12\/Fine-Tuning-a-BERT-Model-MachineLearningMasterycom.jpg\",\"contentUrl\":\"http:\/\/ktromedia.com\/wp-content\/uploads\/2025\/12\/Fine-Tuning-a-BERT-Model-MachineLearningMasterycom.jpg\",\"width\":2560,\"height\":1707},{\"@type\":\"BreadcrumbList\",\"@id\":\"http:\/\/ktromedia.com\/?p=177202#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"Home\",\"item\":\"https:\/\/ktromedia.com\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"Fine-Tuning a BERT Model &#8211; MachineLearningMastery.com\"}]},{\"@type\":\"WebSite\",\"@id\":\"http:\/\/ktromedia.com\/#website\",\"url\":\"http:\/\/ktromedia.com\/\",\"name\":\"Ktromedia\",\"description\":\"KTRO MEDIA Crypto News\",\"publisher\":{\"@id\":\"http:\/\/ktromedia.com\/#organization\"},\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"http:\/\/ktromedia.com\/?s={search_term_string}\"},\"query-input\":{\"@type\":\"PropertyValueSpecification\",\"valueRequired\":true,\"valueName\":\"search_term_string\"}}],\"inLanguage\":\"en-US\"},{\"@type\":\"Organization\",\"@id\":\"http:\/\/ktromedia.com\/#organization\",\"name\":\"Ktromedia\",\"url\":\"http:\/\/ktromedia.com\/\",\"logo\":{\"@type\":\"ImageObject\",\"inLanguage\":\"en-US\",\"@id\":\"http:\/\/ktromedia.com\/#\/schema\/logo\/image\/\",\"url\":\"https:\/\/ktromedia.com\/wp-content\/uploads\/2025\/11\/ktroicon.png\",\"contentUrl\":\"https:\/\/ktromedia.com\/wp-content\/uploads\/2025\/11\/ktroicon.png\",\"width\":250,\"height\":250,\"caption\":\"Ktromedia\"},\"image\":{\"@id\":\"http:\/\/ktromedia.com\/#\/schema\/logo\/image\/\"},\"sameAs\":[\"https:\/\/www.facebook.com\/KTROMedia\/\",\"https:\/\/www.linkedin.com\/company\/ktro-media\/\",\"https:\/\/t.me\/ktrogroup\"]},{\"@type\":\"Person\",\"@id\":\"http:\/\/ktromedia.com\/#\/schema\/person\/612bf2fbac107722ea365932cdd35f5b\",\"name\":\"KTRO TEAM\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"en-US\",\"@id\":\"http:\/\/ktromedia.com\/#\/schema\/person\/image\/\",\"url\":\"http:\/\/ktromedia.com\/wp-content\/uploads\/2025\/10\/cropped-Untitled-design-7-1-150x150.png\",\"contentUrl\":\"http:\/\/ktromedia.com\/wp-content\/uploads\/2025\/10\/cropped-Untitled-design-7-1-150x150.png\",\"caption\":\"KTRO TEAM\"},\"description\":\"KTRO MEDIA \u662f\u4e00\u5bb6\u5168\u7403\u6027\u7684\u534e\u6587WEB3\u5a92\u4f53\u516c\u53f8\u3002\u6211\u4eec\u81f4\u529b\u4e8e\u4e3a\u533a\u5757\u94fe\u548c\u91d1\u878d\u79d1\u6280\u9886\u57df\u63d0\u4f9b\u6700\u65b0\u7684\u65b0\u95fb\u3001\u89c1\u89e3\u548c\u8d8b\u52bf\u5206\u6790\u3002\u6211\u4eec\u7684\u5b97\u65e8\u662f\u4e3a\u5168\u7403\u7528\u6237\u63d0\u4f9b\u9ad8\u8d28\u91cf\u3001\u5168\u9762\u7684\u8d44\u8baf\u670d\u52a1\uff0c\u8ba9\u4ed6\u4eec\u66f4\u597d\u5730\u4e86\u89e3\u533a\u5757\u94fe\u548c\u91d1\u878d\u79d1\u6280\u884c\u4e1a\u7684\u6700\u65b0\u52a8\u6001\u3002\u6211\u4eec\u4e5f\u5e0c\u671b\u80fd\u5e2e\u5230\u66f4\u591a\u4f18\u79c0\u7684WEB3\u4ea7\u54c1\u627e\u5230\u66f4\u591a\u66f4\u597d\u7684\u8d44\u6e90\u597d\u8ba9\u8fd9\u9886\u57df\u53d8\u5f97\u66f4\u6210\u719f\u3002 \u6211\u4eec\u7684\u62a5\u9053\u8303\u56f4\u6db5\u76d6\u4e86\u533a\u5757\u94fe\u3001\u52a0\u5bc6\u8d27\u5e01\u3001\u667a\u80fd\u5408\u7ea6\u3001DeFi\u3001NFT \u548c Web3 \u751f\u6001\u7cfb\u7edf\u7b49\u9886\u57df\u3002\u6211\u4eec\u7684\u62a5\u9053\u4e0d\u4ec5\u6765\u81ea\u884c\u4e1a\u5185\u7684\u4e13\u5bb6\uff0c\u5148\u950b\u8005\u4e5f\u5305\u62ec\u4e86\u6211\u4eec\u81ea\u5df1\u7684\u5206\u6790\u548c\u89c2\u70b9\u3002\u6211\u4eec\u5728\u5404\u4e2a\u56fd\u5bb6\u548c\u5730\u533a\u90fd\u8bbe\u6709\u56e2\u961f\uff0c\u4e3a\u8bfb\u8005\u63d0\u4f9b\u672c\u5730\u5316\u7684\u62a5\u9053\u548c\u5206\u6790\u3002 \u9664\u4e86\u65b0\u95fb\u62a5\u9053\uff0c\u6211\u4eec\u8fd8\u63d0\u4f9b\u5e02\u573a\u7814\u7a76\u548c\u54a8\u8be2\u670d\u52a1\u3002\u6211\u4eec\u7684\u4e13\u4e1a\u56e2\u961f\u53ef\u4ee5\u4e3a\u60a8\u63d0\u4f9b\u6709\u5173\u533a\u5757\u94fe\u548c\u91d1\u878d\u79d1\u6280\u884c\u4e1a\u7684\u6df1\u5165\u5206\u6790\u548c\u5e02\u573a\u8d8b\u52bf\uff0c\u5e2e\u52a9\u60a8\u505a\u51fa\u66f4\u660e\u667a\u7684\u6295\u8d44\u51b3\u7b56\u3002 \u6211\u4eec\u7684\u4f7f\u547d\u662f\u6210\u4e3a\u5168\u7403\u534e\u6587\u533a\u5757\u94fe\u548c\u91d1\u878d\u79d1\u6280\u884c\u4e1a\u6700\u53d7\u4fe1\u8d56\u7684\u4fe1\u606f\u6765\u6e90\u4e4b\u4e00\u3002\u6211\u4eec\u5c06\u7ee7\u7eed\u4e0d\u65ad\u52aa\u529b\uff0c\u4e3a\u8bfb\u8005\u63d0\u4f9b\u6700\u65b0\u3001\u6700\u5168\u9762\u3001\u6700\u53ef\u9760\u7684\u4fe1\u606f\u670d\u52a1\u3002\",\"sameAs\":[\"https:\/\/ktromedia.com\"],\"url\":\"http:\/\/ktromedia.com\/?author=1\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. -->","yoast_head_json":{"title":"Fine-Tuning a BERT Model - MachineLearningMastery.com - Ktromedia","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"http:\/\/ktromedia.com\/?p=177202","og_locale":"en_US","og_type":"article","og_title":"Fine-Tuning a BERT Model - MachineLearningMastery.com - Ktromedia","og_description":"import collections import dataclasses import functools \u00a0 import torch import torch.nn as nn import torch.optim as optim import tqdm from datasets import load_dataset from tokenizers import Tokenizer from torch import Tensor \u00a0 \u00a0 # BERT config and model defined previously @dataclasses.dataclass class BertConfig: \u00a0\u00a0\u00a0\u00a0&#8220;&#8221;&#8220;Configuration for BERT model.&#8221;&#8220;&#8221; \u00a0\u00a0\u00a0\u00a0vocab_size: int = 30522 \u00a0\u00a0\u00a0\u00a0num_layers: int = 12","og_url":"http:\/\/ktromedia.com\/?p=177202","og_site_name":"Ktromedia","article_publisher":"https:\/\/www.facebook.com\/KTROMedia\/","article_published_time":"2025-12-01T13:28:32+00:00","og_image":[{"width":2560,"height":1707,"url":"http:\/\/ktromedia.com\/wp-content\/uploads\/2025\/12\/Fine-Tuning-a-BERT-Model-MachineLearningMasterycom.jpg","type":"image\/jpeg"}],"author":"KTRO TEAM","twitter_card":"summary_large_image","twitter_misc":{"Written by":"KTRO TEAM","Est. reading time":"7 minutes"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"Article","@id":"http:\/\/ktromedia.com\/?p=177202#article","isPartOf":{"@id":"http:\/\/ktromedia.com\/?p=177202"},"author":{"name":"KTRO TEAM","@id":"http:\/\/ktromedia.com\/#\/schema\/person\/612bf2fbac107722ea365932cdd35f5b"},"headline":"Fine-Tuning a BERT Model &#8211; MachineLearningMastery.com","datePublished":"2025-12-01T13:28:32+00:00","mainEntityOfPage":{"@id":"http:\/\/ktromedia.com\/?p=177202"},"wordCount":1387,"commentCount":0,"publisher":{"@id":"http:\/\/ktromedia.com\/#organization"},"image":{"@id":"http:\/\/ktromedia.com\/?p=177202#primaryimage"},"thumbnailUrl":"http:\/\/ktromedia.com\/wp-content\/uploads\/2025\/12\/Fine-Tuning-a-BERT-Model-MachineLearningMasterycom.jpg","articleSection":["\u4eba\u5de5\u667a\u80fd"],"inLanguage":"en-US","potentialAction":[{"@type":"CommentAction","name":"Comment","target":["http:\/\/ktromedia.com\/?p=177202#respond"]}]},{"@type":"WebPage","@id":"http:\/\/ktromedia.com\/?p=177202","url":"http:\/\/ktromedia.com\/?p=177202","name":"Fine-Tuning a BERT Model - MachineLearningMastery.com - Ktromedia","isPartOf":{"@id":"http:\/\/ktromedia.com\/#website"},"primaryImageOfPage":{"@id":"http:\/\/ktromedia.com\/?p=177202#primaryimage"},"image":{"@id":"http:\/\/ktromedia.com\/?p=177202#primaryimage"},"thumbnailUrl":"http:\/\/ktromedia.com\/wp-content\/uploads\/2025\/12\/Fine-Tuning-a-BERT-Model-MachineLearningMasterycom.jpg","datePublished":"2025-12-01T13:28:32+00:00","breadcrumb":{"@id":"http:\/\/ktromedia.com\/?p=177202#breadcrumb"},"inLanguage":"en-US","potentialAction":[{"@type":"ReadAction","target":["http:\/\/ktromedia.com\/?p=177202"]}]},{"@type":"ImageObject","inLanguage":"en-US","@id":"http:\/\/ktromedia.com\/?p=177202#primaryimage","url":"http:\/\/ktromedia.com\/wp-content\/uploads\/2025\/12\/Fine-Tuning-a-BERT-Model-MachineLearningMasterycom.jpg","contentUrl":"http:\/\/ktromedia.com\/wp-content\/uploads\/2025\/12\/Fine-Tuning-a-BERT-Model-MachineLearningMasterycom.jpg","width":2560,"height":1707},{"@type":"BreadcrumbList","@id":"http:\/\/ktromedia.com\/?p=177202#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"Home","item":"https:\/\/ktromedia.com\/"},{"@type":"ListItem","position":2,"name":"Fine-Tuning a BERT Model &#8211; MachineLearningMastery.com"}]},{"@type":"WebSite","@id":"http:\/\/ktromedia.com\/#website","url":"http:\/\/ktromedia.com\/","name":"Ktromedia","description":"KTRO MEDIA Crypto News","publisher":{"@id":"http:\/\/ktromedia.com\/#organization"},"potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"http:\/\/ktromedia.com\/?s={search_term_string}"},"query-input":{"@type":"PropertyValueSpecification","valueRequired":true,"valueName":"search_term_string"}}],"inLanguage":"en-US"},{"@type":"Organization","@id":"http:\/\/ktromedia.com\/#organization","name":"Ktromedia","url":"http:\/\/ktromedia.com\/","logo":{"@type":"ImageObject","inLanguage":"en-US","@id":"http:\/\/ktromedia.com\/#\/schema\/logo\/image\/","url":"https:\/\/ktromedia.com\/wp-content\/uploads\/2025\/11\/ktroicon.png","contentUrl":"https:\/\/ktromedia.com\/wp-content\/uploads\/2025\/11\/ktroicon.png","width":250,"height":250,"caption":"Ktromedia"},"image":{"@id":"http:\/\/ktromedia.com\/#\/schema\/logo\/image\/"},"sameAs":["https:\/\/www.facebook.com\/KTROMedia\/","https:\/\/www.linkedin.com\/company\/ktro-media\/","https:\/\/t.me\/ktrogroup"]},{"@type":"Person","@id":"http:\/\/ktromedia.com\/#\/schema\/person\/612bf2fbac107722ea365932cdd35f5b","name":"KTRO TEAM","image":{"@type":"ImageObject","inLanguage":"en-US","@id":"http:\/\/ktromedia.com\/#\/schema\/person\/image\/","url":"http:\/\/ktromedia.com\/wp-content\/uploads\/2025\/10\/cropped-Untitled-design-7-1-150x150.png","contentUrl":"http:\/\/ktromedia.com\/wp-content\/uploads\/2025\/10\/cropped-Untitled-design-7-1-150x150.png","caption":"KTRO TEAM"},"description":"KTRO MEDIA \u662f\u4e00\u5bb6\u5168\u7403\u6027\u7684\u534e\u6587WEB3\u5a92\u4f53\u516c\u53f8\u3002\u6211\u4eec\u81f4\u529b\u4e8e\u4e3a\u533a\u5757\u94fe\u548c\u91d1\u878d\u79d1\u6280\u9886\u57df\u63d0\u4f9b\u6700\u65b0\u7684\u65b0\u95fb\u3001\u89c1\u89e3\u548c\u8d8b\u52bf\u5206\u6790\u3002\u6211\u4eec\u7684\u5b97\u65e8\u662f\u4e3a\u5168\u7403\u7528\u6237\u63d0\u4f9b\u9ad8\u8d28\u91cf\u3001\u5168\u9762\u7684\u8d44\u8baf\u670d\u52a1\uff0c\u8ba9\u4ed6\u4eec\u66f4\u597d\u5730\u4e86\u89e3\u533a\u5757\u94fe\u548c\u91d1\u878d\u79d1\u6280\u884c\u4e1a\u7684\u6700\u65b0\u52a8\u6001\u3002\u6211\u4eec\u4e5f\u5e0c\u671b\u80fd\u5e2e\u5230\u66f4\u591a\u4f18\u79c0\u7684WEB3\u4ea7\u54c1\u627e\u5230\u66f4\u591a\u66f4\u597d\u7684\u8d44\u6e90\u597d\u8ba9\u8fd9\u9886\u57df\u53d8\u5f97\u66f4\u6210\u719f\u3002 \u6211\u4eec\u7684\u62a5\u9053\u8303\u56f4\u6db5\u76d6\u4e86\u533a\u5757\u94fe\u3001\u52a0\u5bc6\u8d27\u5e01\u3001\u667a\u80fd\u5408\u7ea6\u3001DeFi\u3001NFT \u548c Web3 \u751f\u6001\u7cfb\u7edf\u7b49\u9886\u57df\u3002\u6211\u4eec\u7684\u62a5\u9053\u4e0d\u4ec5\u6765\u81ea\u884c\u4e1a\u5185\u7684\u4e13\u5bb6\uff0c\u5148\u950b\u8005\u4e5f\u5305\u62ec\u4e86\u6211\u4eec\u81ea\u5df1\u7684\u5206\u6790\u548c\u89c2\u70b9\u3002\u6211\u4eec\u5728\u5404\u4e2a\u56fd\u5bb6\u548c\u5730\u533a\u90fd\u8bbe\u6709\u56e2\u961f\uff0c\u4e3a\u8bfb\u8005\u63d0\u4f9b\u672c\u5730\u5316\u7684\u62a5\u9053\u548c\u5206\u6790\u3002 \u9664\u4e86\u65b0\u95fb\u62a5\u9053\uff0c\u6211\u4eec\u8fd8\u63d0\u4f9b\u5e02\u573a\u7814\u7a76\u548c\u54a8\u8be2\u670d\u52a1\u3002\u6211\u4eec\u7684\u4e13\u4e1a\u56e2\u961f\u53ef\u4ee5\u4e3a\u60a8\u63d0\u4f9b\u6709\u5173\u533a\u5757\u94fe\u548c\u91d1\u878d\u79d1\u6280\u884c\u4e1a\u7684\u6df1\u5165\u5206\u6790\u548c\u5e02\u573a\u8d8b\u52bf\uff0c\u5e2e\u52a9\u60a8\u505a\u51fa\u66f4\u660e\u667a\u7684\u6295\u8d44\u51b3\u7b56\u3002 \u6211\u4eec\u7684\u4f7f\u547d\u662f\u6210\u4e3a\u5168\u7403\u534e\u6587\u533a\u5757\u94fe\u548c\u91d1\u878d\u79d1\u6280\u884c\u4e1a\u6700\u53d7\u4fe1\u8d56\u7684\u4fe1\u606f\u6765\u6e90\u4e4b\u4e00\u3002\u6211\u4eec\u5c06\u7ee7\u7eed\u4e0d\u65ad\u52aa\u529b\uff0c\u4e3a\u8bfb\u8005\u63d0\u4f9b\u6700\u65b0\u3001\u6700\u5168\u9762\u3001\u6700\u53ef\u9760\u7684\u4fe1\u606f\u670d\u52a1\u3002","sameAs":["https:\/\/ktromedia.com"],"url":"http:\/\/ktromedia.com\/?author=1"}]}},"_links":{"self":[{"href":"http:\/\/ktromedia.com\/index.php?rest_route=\/wp\/v2\/posts\/177202","targetHints":{"allow":["GET"]}}],"collection":[{"href":"http:\/\/ktromedia.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/ktromedia.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/ktromedia.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/ktromedia.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=177202"}],"version-history":[{"count":1,"href":"http:\/\/ktromedia.com\/index.php?rest_route=\/wp\/v2\/posts\/177202\/revisions"}],"predecessor-version":[{"id":177204,"href":"http:\/\/ktromedia.com\/index.php?rest_route=\/wp\/v2\/posts\/177202\/revisions\/177204"}],"wp:featuredmedia":[{"embeddable":true,"href":"http:\/\/ktromedia.com\/index.php?rest_route=\/wp\/v2\/media\/177203"}],"wp:attachment":[{"href":"http:\/\/ktromedia.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=177202"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/ktromedia.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=177202"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/ktromedia.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=177202"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}