From a87eb00b562b65a2d107d19fb77d1bd49c53d191 Mon Sep 17 00:00:00 2001
From: andyj29
Date: Tue, 11 Apr 2023 16:33:09 -0400
Subject: [PATCH] add single encoder layer

---
 transformer/common/__init__.py  |  3 ++-
 transformer/common/attention.py | 18 +++++++++++---------
 transformer/common/ffn.py       | 11 ++++++-----
 transformer/common/position.py  |  2 +-
 transformer/encoder/encoder.py  | 37 ++++++++++++++++++++++++++++++++++++++++++++---
 5 files changed, 52 insertions(+), 19 deletions(-)

diff --git a/transformer/common/__init__.py b/transformer/common/__init__.py
index 8a745db..cf64a15 100644
--- a/transformer/common/__init__.py
+++ b/transformer/common/__init__.py
@@ -1,4 +1,5 @@
 from .attention import MultiHeadAttention
 from .ffn import FeedForwardNetwork
 from .embedding import Embeddings
-from .position import PositionalEncoding
\ No newline at end of file
+from .position import PositionalEncoding
+from .residual import ResidualConnection
\ No newline at end of file
diff --git a/transformer/common/attention.py b/transformer/common/attention.py
index 6a28060..a850788 100644
--- a/transformer/common/attention.py
+++ b/transformer/common/attention.py
@@ -4,21 +4,21 @@
 
 
 class MultiHeadAttention(nn.Module):
-    def __init__(self, cfg):
+    def __init__(self, d_model, n_head, dropout):
         super(MultiHeadAttention, self).__init__()
 
         # embedding dimension must be divisible by number of heads
-        assert cfg.emb_dim % cfg.n_head == 0
+        assert d_model % n_head == 0
 
         # key, query, value projections for all heads
-        self.c_attn = nn.Linear(cfg.emb_dim, 3 * cfg.emb_dim)
+        self.c_attn = nn.Linear(d_model, 3 * d_model)
         # output projection
-        self.c_proj = nn.Linear(cfg.emb_dim, cfg.emb_dim)
+        self.c_proj = nn.Linear(d_model, d_model)
 
-        self.n_head = cfg.n_head
-        self.emb_dim = cfg.emb_dim
+        self.n_head = n_head
+        self.d_model = d_model
         # regularization
-        self.dropout = cfg.dropout
+        self.dropout = dropout
         self.resid_dropout = nn.Dropout(self.dropout)
 
 
@@ -26,13 +26,13 @@ def forward(self, x, mask=None):
         # B: batch size, S: sequence length, E: embedding dimension
         B, S, E = x.size()
         # pull out the query, key, value from the concatenated projection
-        q, k, v = self.c_attn(x).split(self.emb_dim, dim=2)
+        q, k, v = self.c_attn(x).split(self.d_model, dim=2)
         # split heads and transpose to (B, n_head, S, E // n_head)
         q = q.view(B, S, self.n_head, E // self.n_head).transpose(1, 2)
         k = k.view(B, S, self.n_head, E // self.n_head).transpose(1, 2)
         v = v.view(B, S, self.n_head, E // self.n_head).transpose(1, 2)
         # apply attention
-        y = F.scaled_dot_product_attention(q, k, v, dropout=self.attn_dropout, is_causal=mask)
+        y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=self.dropout if self.training else 0.0)
         # concatenate heads and transpose to (B, S, E)
         y = y.transpose(1, 2).contiguous().view(B, S, E)
         # apply drop out to final linear projection
diff --git a/transformer/common/ffn.py b/transformer/common/ffn.py
index 3a3c394..f56d4f0 100644
--- a/transformer/common/ffn.py
+++ b/transformer/common/ffn.py
@@ -1,13 +1,14 @@
 import torch.nn as nn
 
 class FeedForwardNetwork(nn.Module):
-    def __init__(self, cfg):
+    def __init__(self, d_model, d_ffn_hidden, dropout):
         super(FeedForwardNetwork, self).__init__()
         self.layers = nn.ModuleList(
-            [nn.Linear(cfg.emb_dim, cfg.ffn_dim),
-             nn.ReLU(),
-             nn.Dropout(cfg.dropout),
-             nn.Linear(cfg.ffn_dim, cfg.emb_dim)]
+            [ nn.Linear(d_model, d_ffn_hidden),
+              nn.ReLU(),
+              nn.Dropout(dropout),
+              nn.Linear(d_ffn_hidden, d_model)
+            ]
         )
 
     def forward(self, x):
diff --git a/transformer/common/position.py b/transformer/common/position.py
index 8b110c3..dbdd347 100644
--- a/transformer/common/position.py
+++ b/transformer/common/position.py
@@ -4,7 +4,7 @@
 from torch.autograd import Variable
 
 class PositionalEncoding(nn.Module):
-    def __init__(self, d_model, dropout, max_len=5000):
+    def __init__(self, d_model, dropout, max_len):
         super(PositionalEncoding, self).__init__()
         self.dropout = nn.Dropout(p=dropout)
 
diff --git a/transformer/encoder/encoder.py b/transformer/encoder/encoder.py
index 818e402..73ddef4 100644
--- a/transformer/encoder/encoder.py
+++ b/transformer/encoder/encoder.py
@@ -1,10 +1,41 @@
 import torch.nn as nn
 import torch.nn.functional as F
-from transformer.common import MultiHeadAttention, FeedForwardNetwork
-
+from transformer.common import (
+    MultiHeadAttention,
+    FeedForwardNetwork,
+    ResidualConnection,
+    Embeddings,
+    PositionalEncoding
+)
 
 class EncoderLayer(nn.Module):
-    def __init__(self, attn, ffn, dropout):
+    def __init__(self, d_model, n_head, d_ffn_hidden, dropout=0.1):
         super(EncoderLayer, self).__init__()
+        self.attn = MultiHeadAttention(d_model, n_head, dropout)
+        self.ffn = FeedForwardNetwork(d_model, d_ffn_hidden, dropout)
+        self.residual = nn.ModuleList([
+            ResidualConnection(self.attn),
+            ResidualConnection(self.ffn),
+        ])
+
+    def forward(self, x):
+        for layer in self.residual:
+            x = layer(x)
+
+        return x
 
 
+class Encoder(nn.Module):
+    def __init__(self, d_model, n_stack, n_head, d_ffn_hidden, corpus_len, dropout):
+        super(Encoder, self).__init__()
+        self.layers = nn.ModuleList([
+            EncoderLayer(d_model, n_head, d_ffn_hidden, dropout)
+            for _ in range(n_stack)
+        ])
+        self.emb = Embeddings(d_model)
+        self.pos = PositionalEncoding(d_model, dropout, max_len=corpus_len)
+        self.dropout = nn.Dropout(dropout)
+        self.d_model = d_model
+
+    def forward(self, x):
+        pass
\ No newline at end of file
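
A quick shape check for the reworked constructors. The sizes below are
hypothetical, and this sketch assumes the forward passes in
transformer/common behave as the hunks above suggest:

    import torch
    from transformer.common import MultiHeadAttention, FeedForwardNetwork

    # hypothetical hyperparameters, only for a smoke test
    d_model, n_head, d_ffn_hidden, dropout = 512, 8, 2048, 0.1
    x = torch.randn(2, 16, d_model)   # (batch, seq_len, d_model)

    attn = MultiHeadAttention(d_model, n_head, dropout)
    ffn = FeedForwardNetwork(d_model, d_ffn_hidden, dropout)

    # both sublayers should preserve the (batch, seq_len, d_model) shape
    assert attn(x).shape == x.shape
    assert ffn(x).shape == x.shape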