# Pico GPT2 in Aesara

import aesara.tensor as at
import numpy as np

def gelu(x):
    """Return the tanh approximation of the GELU activation of ``x``.

    Computes, elementwise::

        0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))

    Args:
        x: Symbolic Aesara variable (the file uses it with a scalar).

    Returns:
        The symbolic expression for the activation.
    """
    # The cubic polynomial is the *argument* of tanh: the sqrt(2 / pi) scale
    # multiplies it inside the tanh call, not the tanh result outside.
    inner = at.sqrt(2 / np.pi) * (x + 0.044715 * x ** 3)
    return 0.5 * x * (1 + at.tanh(inner))

# Demo: build the symbolic GELU expression for a scalar input named 'x'.
x = at.scalar('x')
y = gelu(x)

# `aesara` itself is imported here (not at the top of the file) for dprint
# and, further down, aesara.function.
import aesara
# Print the expression graph of `y`.
aesara.dprint(y)

# Rebind `x` to a symbolic vector for the softmax demo that follows.
x = at.vector('x')

def softmax(x):
    """Return the softmax of ``x`` taken along its last axis.

    For a vector this matches ``exp(x) / sum(exp(x))``. For inputs with more
    dimensions each slice along the last axis is normalised independently,
    which is what ``attention`` needs for its 2-D score matrix.

    Args:
        x: Symbolic Aesara tensor with at least one dimension.

    Returns:
        A symbolic tensor shaped like ``x`` whose entries along the last
        axis sum to one.
    """
    # Subtracting the per-row maximum does not change the result
    # mathematically but keeps exp() from overflowing on large inputs.
    shifted = x - at.max(x, axis=-1, keepdims=True)
    exp_x = at.exp(shifted)
    # keepdims=True keeps the reduced axis so the division broadcasts.
    return exp_x / at.sum(exp_x, axis=-1, keepdims=True)

# Demo: build the softmax expression for the vector `x` and compile it into
# a callable taking `x` as its only input.
y  = softmax(x)
fn = aesara.function([x], y)
# Print the symbolic expression graph, then the graph held by the compiled
# function, so the two can be compared.
aesara.dprint(y)
aesara.dprint(fn)

def layer_norm(x, g, b, eps=1e-5):
    """Normalise ``x`` along its last axis, then scale by ``g`` and shift by ``b``.

    Computes ``g * (x - mean) / sqrt(var + eps) + b`` where ``mean`` and
    ``var`` are taken over the last axis. For a vector this is the same as
    reducing over all elements; for inputs with more dimensions each slice
    along the last axis is normalised on its own rather than being mixed with
    the others.

    Args:
        x: Symbolic Aesara tensor with at least one dimension.
        g: Scale applied after normalisation (the demo passes a float).
        b: Offset added after scaling (the demo passes a float).
        eps: Small constant added to the variance before the square root to
            avoid division by zero.

    Returns:
        The symbolic normalised expression.
    """
    # keepdims=True keeps the reduced axis so the statistics broadcast
    # against x.
    mean = at.mean(x, axis=-1, keepdims=True)
    var = at.var(x, axis=-1, keepdims=True)
    return g * (x - mean) / at.sqrt(var + eps) + b

# Demo: layer-normalise a fresh symbolic vector with scale 1 and offset 1,
# then print the resulting expression graph.
x = at.vector('x')
y = layer_norm(x, 1., 1.)
aesara.dprint(y)

def linear(x, w, b):
    """Apply the affine map ``x @ w + b``.

    Args:
        x: Left operand of the matrix product.
        w: Right operand of the matrix product (the weights).
        b: Bias added to the product.

    Returns:
        The result of ``x @ w`` with ``b`` added.
    """
    projected = x @ w
    return projected + b

def ffn(x, c_fc, c_proj):
    """Feed-forward block: linear, then GELU, then linear.

    Args:
        x: Input passed to the first linear map.
        c_fc: Iterable unpacked positionally into ``linear`` after ``x``
            (i.e. its ``w`` and ``b``) for the first map.
        c_proj: Iterable unpacked the same way for the second map.

    Returns:
        The output of the second linear map.
    """
    hidden = linear(x, *c_fc)
    activated = gelu(hidden)
    return linear(activated, *c_proj)

def attention(q, k, v, mask):
    """Scaled dot-product attention.

    Computes ``softmax(q @ k.T / sqrt(q.shape[-1]) + mask) @ v``.

    Args:
        q: Query tensor; its last dimension sets the scaling factor.
        k: Key tensor, transposed before the product with ``q``.
        v: Value tensor multiplied by the attention weights.
        mask: Term added to the scaled scores before the softmax.

    Returns:
        The attention weights applied to ``v``.
    """
    scale = at.sqrt(q.shape[-1])
    scores = q @ k.T / scale + mask
    weights = softmax(scores)
    return weights @ v

def mha(x, c_attn, c_proj, n_head):
    x = linear(x, *c_attn)
    qkv = at.split(x, 3, axis=-1)
    qkv_heads =
    causal_mask =
    out_heads =
    x =
    x = linear(x, *c_proj)