From 3bfd0684c0a41b23a41723daf71627ea1b59a52a Mon Sep 17 00:00:00 2001 From: Joe McCain III Date: Sat, 25 May 2024 09:59:30 -0500 Subject: [PATCH] update Signed-off-by: Joe McCain III --- core/src/nn/dropout.rs | 25 ++++++---- core/tests/nn.rs | 4 +- .../src/attention/{multi => }/config.rs | 18 +++++--- models/transformers/src/attention/head.rs | 23 +++++----- models/transformers/src/attention/mod.rs | 12 ++--- .../transformers/src/attention/multi/mod.rs | 5 +- .../src/attention/multi/multi_head.rs | 11 ++--- models/transformers/src/attention/score.rs | 1 - models/transformers/src/config/mod.rs | 46 +++++++++++++++++++ models/transformers/src/ffn/mod.rs | 7 --- models/transformers/src/lib.rs | 11 ++--- .../src/{ffn/model.rs => model/ffn.rs} | 34 ++++++++------ models/transformers/src/model/mod.rs | 1 + models/transformers/src/model/sublayer.rs | 8 ++-- models/transformers/src/params/mod.rs | 8 ++-- .../src/params/{store.rs => qkv.rs} | 10 +++- models/transformers/tests/ffn.rs | 4 +- models/transformers/tests/ops.rs | 2 +- 18 files changed, 145 insertions(+), 85 deletions(-) rename models/transformers/src/attention/{multi => }/config.rs (66%) create mode 100644 models/transformers/src/config/mod.rs delete mode 100644 models/transformers/src/ffn/mod.rs rename models/transformers/src/{ffn/model.rs => model/ffn.rs} (63%) rename models/transformers/src/params/{store.rs => qkv.rs} (93%) diff --git a/core/src/nn/dropout.rs b/core/src/nn/dropout.rs index 19acdbc..772d2fa 100644 --- a/core/src/nn/dropout.rs +++ b/core/src/nn/dropout.rs @@ -11,7 +11,7 @@ use ndrand::{rand_distr::Bernoulli, RandomExt}; use num::traits::Num; #[cfg(feature = "rand")] -pub fn dropout(array: &ArrayBase, p: f64) -> Array +pub(crate) fn _dropout(array: &ArrayBase, p: f64) -> Array where A: Num + ScalarOperand, D: Dimension, @@ -29,7 +29,7 @@ where } /// [Dropout] randomly zeroizes elements with a given probability (`p`). 
-pub trait Dropout { +pub trait DropOut { type Output; fn dropout(&self, p: f64) -> Self::Output; @@ -44,7 +44,7 @@ pub trait Dropout { /// - (p) Probability of dropping an element #[derive(Clone, Copy, Debug, PartialEq, PartialOrd)] #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] -pub struct DropoutLayer { +pub struct Dropout { pub(crate) p: f64, } @@ -52,7 +52,7 @@ pub struct DropoutLayer { ************* Implementations ************* */ #[cfg(feature = "rand")] -impl Dropout for ArrayBase +impl DropOut for ArrayBase where A: Num + ScalarOperand, D: Dimension, @@ -61,28 +61,37 @@ where type Output = Array; fn dropout(&self, p: f64) -> Self::Output { - dropout(self, p) + _dropout(self, p) } } -impl DropoutLayer { +impl Dropout { pub fn new(p: f64) -> Self { Self { p } } + pub fn apply(&self, input: &ArrayBase) -> Array + where + A: Num + ScalarOperand, + D: Dimension, + S: DataOwned, + { + _dropout(input, self.p) + } + pub fn scale(&self) -> f64 { (1f64 - self.p).recip() } } -impl Default for DropoutLayer { +impl Default for Dropout { fn default() -> Self { Self::new(0.5) } } #[cfg(feature = "rand")] -impl Forward> for DropoutLayer +impl Forward> for Dropout where A: Num + ScalarOperand, D: Dimension, diff --git a/core/tests/nn.rs b/core/tests/nn.rs index 55b5119..e6ba7e5 100644 --- a/core/tests/nn.rs +++ b/core/tests/nn.rs @@ -1,7 +1,7 @@ #![allow(unused_imports)] extern crate concision_core as concision; -use concision::nn::DropoutLayer; +use concision::nn::Dropout; use concision::Forward; use ndarray::prelude::*; @@ -10,7 +10,7 @@ use ndarray::prelude::*; fn test_dropout() { let shape = (512, 2048); let arr = Array2::::ones(shape); - let dropout = DropoutLayer::new(0.5); + let dropout = Dropout::new(0.5); let out = dropout.forward(&arr); assert!(arr.iter().all(|&x| x == 1.0)); diff --git a/models/transformers/src/attention/multi/config.rs b/models/transformers/src/attention/config.rs similarity index 66% rename from models/transformers/src/attention/multi/config.rs rename to models/transformers/src/attention/config.rs index 58c510c..18b7820 100644 --- a/models/transformers/src/attention/multi/config.rs +++ b/models/transformers/src/attention/config.rs @@ -9,13 +9,17 @@ pub(crate) fn dk(d_model: usize, heads: usize) -> usize { #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] -pub struct Config { - pub d_model: usize, - pub heads: usize, +pub struct AttentionConfig { + pub d_model: usize, // embedding size; default is 512 + pub heads: usize, // number of heads; default is 8 } -impl Config { - pub fn new() -> ConfigBuilder { +impl AttentionConfig { + pub fn new(d_model: usize, heads: usize) -> Self { + Self { d_model, heads } + } + /// + pub fn create() -> ConfigBuilder { ConfigBuilder::new() } @@ -32,7 +36,7 @@ impl Config { } } -impl Default for Config { +impl Default for AttentionConfig { fn default() -> Self { Self { d_model: crate::D_MODEL, @@ -42,7 +46,7 @@ impl Default for Config { } concision::builder! 
{ - ConfigBuilder(Config) { + ConfigBuilder(AttentionConfig) { d_model: usize, heads: usize, } diff --git a/models/transformers/src/attention/head.rs b/models/transformers/src/attention/head.rs index e80fdda..76e69f0 100644 --- a/models/transformers/src/attention/head.rs +++ b/models/transformers/src/attention/head.rs @@ -5,7 +5,7 @@ use super::{Score, _attention}; use crate::params::QkvBase; use concision::getters; -use concision::nn::DropoutLayer; +use concision::nn::Dropout; use nd::linalg::Dot; use nd::*; use num::complex::ComplexFloat; @@ -16,13 +16,14 @@ use num::complex::ComplexFloat; /// be flexible, relying upon the n-dimensional [QkvBase] to store the query, key, and value tensors. /// More so, the head may be configured with an optional dropout and/or masking layers. /// -/// ### Dropout +/// ### `dropout` /// -/// The [DropoutLayer] is an optional layer applied after the softmax function is applied to the -/// score. The layer is used to prevent overfitting by randomly setting a fraction of the input +/// The [Dropout] layer is an optional, conditionally enabled layer (required the `rand` feature). +/// If enabled, the dropout layer is invoked after the softmax function is applied to the score. +/// The layer is used to prevent overfitting by randomly setting a fraction of the input /// units to zero at each update during training time. /// -/// ### Masking +/// ### `mask` /// /// After computing the dot-product of the query and key tensors, an optional mask may be applied to /// the attention score. The mask is used to prevent the model from attending to certain parts of the @@ -34,7 +35,7 @@ where S: RawData, { #[cfg(feature = "rand")] - pub(crate) dropout: Option, + pub(crate) dropout: Option, pub(crate) mask: Option>, pub(crate) params: QkvBase, } @@ -48,7 +49,7 @@ where A: Default, S: DataOwned, { - Self::from_params(QkvBase::new((dk, dm))) + Self::from_params(QkvBase::std(dk, dm)) } } @@ -115,7 +116,7 @@ where } /// Sets the dropout layer for the [AttentionHead] #[cfg(feature = "rand")] - pub fn set_dropout(&mut self, dropout: Option) { + pub fn set_dropout(&mut self, dropout: Option) { self.dropout = dropout; } /// Sets the mask for the [AttentionHead] @@ -124,7 +125,7 @@ where } /// Configure the [AttentionHead] with a [DropoutLayer] #[cfg(feature = "rand")] - pub fn with_dropout(self, dropout: DropoutLayer) -> Self { + pub fn with_dropout(self, dropout: Dropout) -> Self { Self { dropout: Some(dropout), ..self @@ -153,7 +154,7 @@ where /// Returns an immutable reference to the, optional, [dropout](DropoutLayer) layer. /// With the `rand` feature flag disabled, the dropout layer is /// unavailable and returns `None`. - pub fn dropout(&self) -> Option<&DropoutLayer> { + pub fn dropout(&self) -> Option<&Dropout> { self.dropout.as_ref() } } @@ -168,7 +169,7 @@ where /// With the `rand` feature flag disabled, the dropout layer is /// unavailable and returns `None`. #[cfg(not(feature = "rand"))] - pub fn dropout(&self) -> Option<&DropoutLayer> { + pub fn dropout(&self) -> Option<&Dropout> { None } } diff --git a/models/transformers/src/attention/mod.rs b/models/transformers/src/attention/mod.rs index a500b5f..2f4c729 100644 --- a/models/transformers/src/attention/mod.rs +++ b/models/transformers/src/attention/mod.rs @@ -9,10 +9,10 @@ //! the Transformer model, primarily due to its capabilities in natural language //! 
processing (NLP) domains pub(crate) use self::_impl_methods::*; -pub use self::head::AttentionHead; -pub use self::score::Score; +pub use self::{config::AttentionConfig, head::AttentionHead, score::Score}; pub use self::utils::*; +pub(crate) mod config; pub(crate) mod head; pub(crate) mod score; @@ -34,7 +34,7 @@ pub trait Attention { pub(crate) mod utils { use super::Score; - use concision::nn::DropoutLayer; + use concision::nn::Dropout; use nd::linalg::Dot; use nd::prelude::*; use num::complex::ComplexFloat; @@ -45,7 +45,7 @@ pub(crate) mod utils { k: &ArrayBase, v: &ArrayBase, mask: Option<&Array>, - dropout: Option<&DropoutLayer>, + dropout: Option<&Dropout>, ) -> Score where A: ComplexFloat + nd::ScalarOperand, @@ -60,7 +60,7 @@ pub(crate) mod utils { mod _impl_methods { use super::Score; - use concision::prelude::{DropoutLayer, MaskFill, Softmax}; + use concision::prelude::{Dropout, MaskFill, Softmax}; use nd::linalg::Dot; use nd::prelude::*; use num::complex::ComplexFloat; @@ -70,7 +70,7 @@ mod _impl_methods { k: &ArrayBase, v: &ArrayBase, mask: Option<&Array>, - dropout: Option<&DropoutLayer>, + dropout: Option<&Dropout>, ) -> Score where A: ComplexFloat + nd::ScalarOperand, diff --git a/models/transformers/src/attention/multi/mod.rs b/models/transformers/src/attention/multi/mod.rs index e101f03..4125126 100644 --- a/models/transformers/src/attention/multi/mod.rs +++ b/models/transformers/src/attention/multi/mod.rs @@ -5,12 +5,11 @@ //! # Multi-Head Attention //! //! -pub use self::{config::Config, multi_head::*}; +pub use self::multi_head::*; -pub(crate) mod config; +// pub(crate) mod config; pub(crate) mod multi_head; pub(crate) mod prelude { - pub use super::config::Config as MultiHeadAttentionConfig; pub use super::multi_head::MultiHeadAttention; } diff --git a/models/transformers/src/attention/multi/multi_head.rs b/models/transformers/src/attention/multi/multi_head.rs index 36a4051..f6baa7c 100644 --- a/models/transformers/src/attention/multi/multi_head.rs +++ b/models/transformers/src/attention/multi/multi_head.rs @@ -2,8 +2,7 @@ Appellation: multi_head Contrib: FL03 */ -use super::Config; -use crate::AttentionHead; +use crate::{attention::AttentionConfig, AttentionHead}; use linear::{Biased, Linear}; use nd::prelude::*; use nd::{DataOwned, OwnedRepr, RawData}; @@ -13,7 +12,7 @@ where D: Dimension, S: RawData, { - pub(crate) config: Config, + pub(crate) config: AttentionConfig, pub(crate) head: AttentionHead, pub(crate) linears: Vec>, } @@ -23,7 +22,7 @@ where D: Dimension, S: RawData, { - pub const fn config(&self) -> &Config { + pub const fn config(&self) -> &AttentionConfig { &self.config } @@ -49,7 +48,7 @@ where A: Clone + Default, S: DataOwned, { - let config = Config::new().d_model(d_model).heads(heads).build(); + let config = AttentionConfig::new(d_model, heads); let linears = (0..4) .map(|_| Linear::from_features(d_model, d_model)) .collect(); @@ -69,7 +68,7 @@ where { fn default() -> Self { Self { - config: Config::default(), + config: AttentionConfig::default(), head: AttentionHead::default(), linears: Vec::new(), } diff --git a/models/transformers/src/attention/score.rs b/models/transformers/src/attention/score.rs index 3e1df96..5451f7e 100644 --- a/models/transformers/src/attention/score.rs +++ b/models/transformers/src/attention/score.rs @@ -33,7 +33,6 @@ where pub fn into_score(self) -> Array { self.score } - /// Retrieve the attention tensor. 
pub fn attention(&self) -> &Array { &self.attention diff --git a/models/transformers/src/config/mod.rs b/models/transformers/src/config/mod.rs new file mode 100644 index 0000000..b052876 --- /dev/null +++ b/models/transformers/src/config/mod.rs @@ -0,0 +1,46 @@ +/* + Appellation: config + Contrib: FL03 +*/ + + +pub struct TransformerConfig { + pub heads: usize, +} + +pub struct Features { + + pub d_model: usize, + +} + +pub struct QkvShape { + pub dq: usize, + pub dk: usize, + pub dv: usize, +} + +impl QkvShape { + pub fn new(dq: usize, dk: usize, dv: usize) -> Self { + Self { + dq, + dk, + dv, + } + } + + pub fn std(dk: usize) -> Self { + let (dq, dv) = (dk, dk); + + Self::new(dq, dk, dv) + } +} + + +pub struct EmbedConfig { + +} + +pub struct FFNConfig { + +} \ No newline at end of file diff --git a/models/transformers/src/ffn/mod.rs b/models/transformers/src/ffn/mod.rs deleted file mode 100644 index 2ce65f8..0000000 --- a/models/transformers/src/ffn/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -/* - Appellation: model - Contrib: FL03 -*/ -pub use self::model::FeedForwardNetwork; - -pub(crate) mod model; diff --git a/models/transformers/src/lib.rs b/models/transformers/src/lib.rs index 6dae5eb..a6439f8 100644 --- a/models/transformers/src/lib.rs +++ b/models/transformers/src/lib.rs @@ -18,13 +18,7 @@ extern crate concision_linear as linear; extern crate ndarray as nd; #[doc(inline)] -pub use self::attention::prelude::{ - scaled_dot_product_attention, AttentionHead, MultiHeadAttention, -}; -#[doc(inline)] -pub use self::transformer::Transformer; -#[doc(inline)] -pub use self::{params::*, primitives::*}; +pub use self::{attention::prelude::*, params::prelude::*, ops::prelude::*, primitives::*, transformer::Transformer}; #[macro_use] pub(crate) mod macros; @@ -33,7 +27,7 @@ pub(crate) mod transformer; pub mod attention; pub mod codec; -pub mod ffn; +pub mod config; pub mod model; pub mod ops; pub mod params; @@ -46,5 +40,6 @@ mod impls { pub mod prelude { pub use super::attention::prelude::*; + pub use super::params::prelude::*; pub use super::Transformer; } diff --git a/models/transformers/src/ffn/model.rs b/models/transformers/src/model/ffn.rs similarity index 63% rename from models/transformers/src/ffn/model.rs rename to models/transformers/src/model/ffn.rs index 75f5803..00a29ab 100644 --- a/models/transformers/src/ffn/model.rs +++ b/models/transformers/src/model/ffn.rs @@ -2,32 +2,40 @@ Appellation: model Contrib: FL03 */ -use concision::prelude::{DropoutLayer, Forward, Predict, PredictError, ReLU}; +use concision::prelude::{Dropout, Forward, Predict, PredictError, ReLU}; use linear::{Biased, Linear, ParamMode}; use nd::prelude::*; use nd::{RemoveAxis, ScalarOperand}; use num::traits::Num; -// -pub struct FeedForwardNetwork +// #84: FeedForwardNetwork +/// A piecewise, feed-forward neural network consisting of two [Linear] layers with a ReLU activation function +/// optionally (and conditionally) supporting an [Dropout] layer. 
+/// +/// ### Shape +/// +/// - d_model: Embedding size +/// - d_ff: upward projection +/// +pub struct FeedForwardNetwork where D: Dimension, { #[cfg(feature = "rand")] - pub(crate) dropout: Option, + pub(crate) dropout: Option, pub(crate) input: Linear, pub(crate) output: Linear, } -impl FeedForwardNetwork +impl FeedForwardNetwork where K: ParamMode, { - pub fn new(d_model: usize, features: usize, dropout: Option) -> Self + pub fn std(d_model: usize, features: usize, dropout: Option) -> Self where A: Clone + Default, { - let dropout = dropout.map(|p| DropoutLayer::new(p)); + let dropout = dropout.map(|p| Dropout::new(p)); let input = Linear::from_features(d_model, features); let output = Linear::from_features(features, d_model); Self { @@ -38,7 +46,7 @@ where } } -impl FeedForwardNetwork +impl FeedForwardNetwork where D: Dimension, { @@ -52,26 +60,26 @@ where } #[cfg(feature = "rand")] -impl FeedForwardNetwork +impl FeedForwardNetwork where D: Dimension, { - pub fn dropout(&self) -> Option<&DropoutLayer> { + pub fn dropout(&self) -> Option<&Dropout> { self.dropout.as_ref() } } #[cfg(not(feature = "rand"))] -impl FeedForwardNetwork +impl FeedForwardNetwork where D: Dimension, { - pub fn dropout(&self) -> Option<&DropoutLayer> { + pub fn dropout(&self) -> Option<&Dropout> { None } } -impl Predict> for FeedForwardNetwork +impl Predict> for FeedForwardNetwork where B: Num + PartialOrd + ScalarOperand, D: RemoveAxis, diff --git a/models/transformers/src/model/mod.rs b/models/transformers/src/model/mod.rs index ac227da..35cce20 100644 --- a/models/transformers/src/model/mod.rs +++ b/models/transformers/src/model/mod.rs @@ -3,4 +3,5 @@ Contrib: FL03 */ +pub mod ffn; pub mod sublayer; diff --git a/models/transformers/src/model/sublayer.rs b/models/transformers/src/model/sublayer.rs index a1a5fbe..f6fbb6c 100644 --- a/models/transformers/src/model/sublayer.rs +++ b/models/transformers/src/model/sublayer.rs @@ -3,7 +3,7 @@ Contrib: FL03 */ #![cfg(feature = "rand")] -use concision::nn::DropoutLayer; +use concision::nn::Dropout; use concision::Forward; use linear::{Biased, LayerNorm, ParamMode, Unbiased}; use nd::prelude::*; @@ -16,7 +16,7 @@ pub struct Sublayer where D: Dimension, { - pub(crate) dropout: DropoutLayer, + pub(crate) dropout: Dropout, pub(crate) norm: LayerNorm, } @@ -31,12 +31,12 @@ where Sh: ShapeBuilder, { Self { - dropout: DropoutLayer::new(dropout), + dropout: Dropout::new(dropout), norm: LayerNorm::new(shape), } } - pub fn dropout(&self) -> &DropoutLayer { + pub fn dropout(&self) -> &Dropout { &self.dropout } diff --git a/models/transformers/src/params/mod.rs b/models/transformers/src/params/mod.rs index ba79e10..ba0136f 100644 --- a/models/transformers/src/params/mod.rs +++ b/models/transformers/src/params/mod.rs @@ -2,9 +2,9 @@ Appellation: params Contrib: FL03 */ -pub use self::{item::*, store::QkvBase}; +pub use self::{item::*, qkv::QkvBase}; -mod store; +mod qkv; pub mod item; @@ -31,7 +31,7 @@ params_ty!( #[allow(unused_imports)] pub(crate) mod prelude { - pub use super::item::{Entry, QKV}; - pub use super::store::QkvBase; + pub use super::item::QKV; + pub use super::qkv::QkvBase; pub use super::{ArcQkv, Qkv, ViewQkv}; } diff --git a/models/transformers/src/params/store.rs b/models/transformers/src/params/qkv.rs similarity index 93% rename from models/transformers/src/params/store.rs rename to models/transformers/src/params/qkv.rs index f59ee6e..d35298e 100644 --- a/models/transformers/src/params/store.rs +++ b/models/transformers/src/params/qkv.rs @@ -3,7 +3,7 @@ Contrib: 
FL03 */ use crate::attention::{Score, _attention}; -use concision::nn::DropoutLayer; +use concision::nn::Dropout; use concision::{dimensional, getters}; use nd::linalg::Dot; use nd::*; @@ -88,6 +88,12 @@ where qkv_view!(view_mut::<'a, ViewRepr>(&mut self) where S: DataMut); } +impl QkvBase where S: RawData { + pub fn std(dk: usize, d_model: usize) -> Self where A: Default, S: DataOwned { + Self::new((dk, d_model)) + } +} + #[cfg(not(feature = "rand"))] impl QkvBase where @@ -123,7 +129,7 @@ where ArrayBase: for<'a> Dot, Output = Array>, Array: Dot, Output = Array>, { - let dropout = dropout.map(DropoutLayer::new); + let dropout = dropout.map(Dropout::new); let (q, k, v) = self.qkv(); _attention(q, k, v, mask, dropout.as_ref()) } diff --git a/models/transformers/tests/ffn.rs b/models/transformers/tests/ffn.rs index 3bf9590..ad37ab8 100644 --- a/models/transformers/tests/ffn.rs +++ b/models/transformers/tests/ffn.rs @@ -8,14 +8,14 @@ extern crate concision_transformer as transformer; use concision::prelude::{linarr, Predict}; use linear::Biased; -use transformer::ffn::FeedForwardNetwork; +use transformer::model::ffn::FeedForwardNetwork; use ndarray::prelude::*; #[test] fn test_ffn() { let (samples, d_model, d_ff) = (100, 30, 3); - let model = FeedForwardNetwork::::new(d_model, d_ff, Some(0.1)); + let model = FeedForwardNetwork::::std(d_model, d_ff, Some(0.1)); let data = linarr::((samples, d_model)).unwrap(); diff --git a/models/transformers/tests/ops.rs b/models/transformers/tests/ops.rs index f8407de..e9b5d0c 100644 --- a/models/transformers/tests/ops.rs +++ b/models/transformers/tests/ops.rs @@ -18,7 +18,7 @@ fn test_merge() { let shape = (3, 4, 5); let dout = (4, 15); let arr = linarr::(shape.clone()).unwrap(); - let a = arr.clone().merge().unwrap(); + let a = arr.merge().unwrap(); assert_eq!(a.dim(), dout); assert_eq!(a, utils::merge3(&arr).unwrap());
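Usage sketch (not part of the patch): the updated tests above exercise the renamed items only in passing, so the snippet below pulls them together — `DropoutLayer` renamed to `Dropout`, `transformer::ffn::FeedForwardNetwork` moved to `transformer::model::ffn` with its constructor renamed from `new` to `std`. The crate paths and method names follow the tests in this diff; the concrete type parameters (`f64`, `Biased`, `Ix2`) and the exact `Predict::predict` signature are assumptions, since the diff does not show them.

// Sketch of the post-rename API, under the assumptions stated above.
extern crate concision_core as concision;
extern crate concision_linear as linear;
extern crate concision_transformer as transformer;

use concision::nn::Dropout;                      // formerly `DropoutLayer`
use concision::prelude::{linarr, Forward, Predict};
use linear::Biased;
use ndarray::prelude::*;
use transformer::model::ffn::FeedForwardNetwork; // formerly `transformer::ffn::FeedForwardNetwork`

fn main() {
    // `Dropout` keeps the old constructor; only the type name changed.
    let dropout = Dropout::new(0.5);
    let ones = Array2::<f64>::ones((8, 16));
    // Roughly half of the entries are zeroed at random; the shape is preserved.
    let masked = dropout.forward(&ones);
    assert_eq!(masked.dim(), (8, 16));

    // The feed-forward block now lives under `model::ffn` and is built with `std`.
    // Type arguments `<f64, Biased>` mirror the imports in tests/ffn.rs but are assumed.
    let (samples, d_model, d_ff) = (16, 32, 64);
    let ffn = FeedForwardNetwork::<f64, Biased>::std(d_model, d_ff, Some(0.1));
    let data = linarr::<f64, Ix2>((samples, d_model)).unwrap();
    // Assumed: `predict` returns a Result whose Ok value has shape (samples, d_model).
    let output = ffn.predict(&data).unwrap();
    assert_eq!(output.dim(), (samples, d_model));
}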