
Reimplementing nanoGPT — model.py

Since the code is fairly long, the best approach is to implement it module by module:

1. LayerNorm

import math
import inspect
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F

# LayerNorm with an optional bias
class LayerNorm(nn.Module):
    def __init__(self, ndim, bias=None):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

Test:
x = torch.randn(4, 2, 8)
ndim = 8
my_layernorm = LayerNorm(ndim, bias=True)
my_output = my_layernorm(x)

layernorm = nn.LayerNorm(ndim)
layernorm.weight.data = my_layernorm.weight.clone()
if my_layernorm.bias is not None:
    layernorm.bias.data = my_layernorm.bias.clone()
official_output = layernorm(x)

print("Difference negligible:", torch.allclose(my_output, official_output, atol=1e-6))

2. Attention

class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embd = config.embedding_size
        self.n_head = config.n_head
        assert self.embd % self.n_head == 0
        self.dropout = config.dropout
        # note: don't also store config.bias as self.bias — it would clash with the "bias" buffer below
        self.c_attn = nn.Linear(self.embd, 3 * self.embd, bias=config.bias)
        self.c_proj = nn.Linear(self.embd, self.embd, bias=config.bias)
        self.resid_dropout = nn.Dropout(self.dropout)
        self.attn_dropout = nn.Dropout(self.dropout)
        # use the fused kernel when available, otherwise fall back to a manual causal mask
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("Warning! Current torch doesn't have scaled_dot_product_attention")
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                 .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.c_attn(x)
        q, k, v = torch.split(qkv, self.embd, dim=2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        if self.flash:
            y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, is_causal=True,
                                               dropout_p=self.dropout if self.training else 0)
        else:
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_dropout(self.c_proj(y))
        return y
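A quick way to sanity-check the manual masked-softmax path against the built-in kernel (a minimal sketch on random tensors, using the imports at the top; the shapes and tolerance are my own choices, not from the original post):

# The manual path should match F.scaled_dot_product_attention(..., is_causal=True) when dropout is 0.
B, n_head, T, head_dim = 2, 4, 8, 16
q, k, v = (torch.randn(B, n_head, T, head_dim) for _ in range(3))

y_flash = F.scaled_dot_product_attention(q, k, v, attn_mask=None, is_causal=True, dropout_p=0.0)

mask = torch.tril(torch.ones(T, T)).view(1, 1, T, T)          # explicit lower-triangular mask
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
att = att.masked_fill(mask == 0, float('-inf'))
att = F.softmax(att, dim=-1)
y_manual = att @ v

print(torch.allclose(y_flash, y_manual, atol=1e-5))  # expect True, up to small numerical differences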

3. MLP, Block, and Config

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.l_1 = nn.Linear(config.embedding_size, 4 * config.embedding_size, bias=config.bias)
        self.gelu = nn.GELU()
        self.l_2 = nn.Linear(4 * config.embedding_size, config.embedding_size, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.l_1(x)
        x = self.gelu(x)
        x = self.l_2(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln1 = LayerNorm(config.embedding_size, config.bias)
        self.att = CausalSelfAttention(config)
        self.ln2 = LayerNorm(config.embedding_size, config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.att(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

@dataclass
class Config:
    n_head: int = 12
    embedding_size: int = 768
    bias: bool = True
    dropout: float = 0.0
    block_size: int = 1024
    #n_layer:
    #vocab_size:
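A quick smoke test for Block (my own sketch, reusing the Config defaults above; note that the output of torch.randint has to be cast to float before it can pass through LayerNorm and Linear, as one of the pitfalls below points out):

config = Config()
block = Block(config)
x = torch.randint(0, 10, (2, 16, config.embedding_size)).float()  # randint yields int64; cast to float
y = block(x)
print(y.shape)  # expected: torch.Size([2, 16, 768])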

Common mistakes when reimplementing the first four parts:

(1) In CausalSelfAttention.forward, calling x.shape() is wrong:
B, T, C = x.shape()

It should be:

B, T, C = x.shape

Reason: shape is an attribute, not a method.


(2) When using scaled_dot_product_attention, the dropout probability must depend on whether the model is in training or evaluation mode; no dropout should be applied at evaluation time:
y = F.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout if self.training else 0, is_causal=True)

⚠️ Explanation: the other dropout calls don't need this check because they go through nn.Dropout(), which automatically checks the module's training/eval state to decide whether to apply dropout.
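A tiny illustration of that behavior (a sketch; the probability and tensor are arbitrary):

drop = nn.Dropout(0.5)
x = torch.ones(4)
drop.train()
print(drop(x))  # roughly half the entries zeroed, survivors scaled by 2
drop.eval()
print(drop(x))  # identical to x: nn.Dropout is a no-op in eval mode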


(3) self.bias = torch.tril(...) written incorrectly; the argument to tril should be a tensor:
self.register_buffer("bias", torch.tril([config.block_size, config.block_size])...)

It should be:

self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

torch.tril() expects a Tensor, not a list.
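For intuition, this is what the buffer looks like for a small block_size (sketch with block_size = 4):

mask = torch.tril(torch.ones(4, 4))  # lower-triangular causal mask
print(mask)
# tensor([[1., 0., 0., 0.],
#         [1., 1., 0., 0.],
#         [1., 1., 1., 0.],
#         [1., 1., 1., 1.]])
# In the manual attention path, positions where mask == 0 are filled with -inf before softmax.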


(5) Don't forget super().__init__() when defining each class.

4. GPT

First, the three most important functions of the GPT model: __init__, forward, and generate.

(1) __init__()

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            drop = nn.Dropout(config.dropout),
            ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=config.bias)
        self.transformer.wte.weight = self.lm_head.weight  # weight tying
        # initialize weights
        self.apply(self.init_weight_)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer))
        print("num of parameters: %.2fM" % (self.get_num_params() / 1e6,))

On the weight tying: the recommendation is to let lm_head's weight overwrite wte's weight (wte.weight = lm_head.weight). Done the other way around, lm_head's output weight would be wiped out and the model would break.
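A small check that the tying really makes the two modules share one Parameter (a sketch with made-up sizes, standalone rather than inside the GPT class):

wte = nn.Embedding(100, 32)
lm_head = nn.Linear(32, 100, bias=False)
wte.weight = lm_head.weight            # same direction as in __init__ above
print(wte.weight is lm_head.weight)    # True: a single shared Parameter
print(wte.weight.shape)                # torch.Size([100, 32]) -- the shapes are compatible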

(2) forward

    def forward(self, x, target=None):
        device = x.device
        b, t = x.size()
        assert t <= self.config.block_size
        pos = torch.arange(0, t, dtype=torch.long, device=device)
        token_emb = self.transformer.wte(x)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(token_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        if target is not None:
            logits = self.lm_head(x)
            # view(-1, ...) flattens every dimension except the last
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1), ignore_index=-1)
        else:
            logits = self.lm_head(x[:, [-1], :])
            loss = None
        return logits, loss

Checking whether target is None is effectively checking whether we are in training mode. In logits = self.lm_head(x[:, [-1], :]), indexing with [-1] keeps the time dimension so the logits have the same number of dimensions as in the target-is-not-None branch.
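The difference between indexing with -1 and [-1] in one line (sketch):

x = torch.randn(2, 5, 8)    # (batch, time, embd)
print(x[:, -1, :].shape)    # torch.Size([2, 8])    -- time dimension dropped
print(x[:, [-1], :].shape)  # torch.Size([2, 1, 8]) -- time dimension kept, same rank as the training-time logits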

(3) generate

    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        for _ in range(max_new_tokens):
            # crop the context to at most block_size tokens
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            logits = logits / temperature
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('inf')
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
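To see what the top_k branch does, here is the same filtering applied to a standalone row of logits (a sketch; the numbers are made up):

logits = torch.tensor([[2.0, 0.5, 1.0, -1.0, 3.0]])
top_k = 2
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
logits[logits < v[:, [-1]]] = -float('inf')   # everything below the k-th largest logit is removed
print(logits)                      # tensor([[2., -inf, -inf, -inf, 3.]])
print(F.softmax(logits, dim=-1))   # probability mass only on positions 0 and 4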

(4) crop_block_size, configure_optimizers, init_weight_, get_num_params, from_pretrained, estimate_mfu

The remaining functions are only built because they are needed in train.py and sample.py. For example, crop_block_size handles the case where the requested block_size is smaller than the block_size in the config; from_pretrained handles loading a previously trained (pretrained) model; configure_optimizers builds the optimizer; and estimate_mfu is used in bench.py to evaluate the model.

    def init_weight_(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def get_num_params(self, non_embedding=True):
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()  # wte is not subtracted because its weight is shared with lm_head
        return n_params

    def crop_block_size(self, block_size):
        assert block_size <= self.config.block_size
        self.config.block_size = block_size
        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
        for block in self.transformer.h:
            if hasattr(block.att, 'bias'):
                block.att.bias = block.att.bias[:, :, :block_size, :block_size]

    def from_pretrained(self, model_type, override_arg=None):
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        override_arg = override_arg or {}
        assert all(k == 'dropout' for k in override_arg)
        from transformers import GPT2LMHeadModel
        print("Loading model from pretrained model %s" % model_type)
        # build the config
        config_args = {
            'gpt2':        dict(n_layer=12, n_head=12, n_embd=768),
            'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024),
            'gpt2-large':  dict(n_layer=36, n_head=20, n_embd=1280),
            'gpt2-xl':     dict(n_layer=48, n_head=25, n_embd=1600),
        }[model_type]
        print("Enforcing bias=True, block_size=1024, vocab_size=50257")
        config_args['bias'] = True
        config_args['block_size'] = 1024
        config_args['vocab_size'] = 50257
        if 'dropout' in override_arg:
            print(f"updating the dropout rate to {override_arg['dropout']}")
            config_args['dropout'] = override_arg['dropout']
        # build the model
        config = GPTConfig(**config_args)
        model = GPT(config)
        # copy over the weights
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')]
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()
        sd_hf_keys = sd_hf.keys()
        sd_hf_keys = [k for k in sd_hf_keys if not k.endswith('.attn.bias')]
        sd_hf_keys = [k for k in sd_hf_keys if not k.endswith('.attn.masked_bias')]
        # these HF weights are stored in Conv1D layout and must be transposed into Linear layout
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        assert len(sd_hf_keys) == len(sd_keys)
        for k in sd_hf_keys:
            if any(k.endswith(w) for w in transposed):
                assert sd[k].shape[::-1] == sd_hf[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                assert sd[k].shape == sd_hf[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])
        return model

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        params = {pn: p for pn, p in self.named_parameters()}
        params = {pn: p for pn, p in params.items() if p.requires_grad}
        params_decay = [p for pn, p in params.items() if p.dim() >= 2]
        params_nodecay = [p for pn, p in params.items() if p.dim() < 2]
        optim_groups = [
            {'params': params_decay, 'weight_decay': weight_decay},
            {'params': params_nodecay, 'weight_decay': 0.0},
        ]
        num_decay_params = sum(p.numel() for p in params_decay)
        num_nodecay_params = sum(p.numel() for p in params_nodecay)
        print(f"num decayed parameter tensors: {len(params_decay)}, with {num_decay_params} parameters")
        print(f"num non-decayed parameter tensors: {len(params_nodecay)}, with {num_nodecay_params} parameters")
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = device_type == 'cuda' and fused_available
        extra_arg = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_arg)
        return optimizer

    def estimate_mfu(self, fwdbwd_per_iter, dt):
        N = self.get_num_params()
        cfg = self.config
        L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd // cfg.n_head, cfg.block_size
        flops_per_token = 6 * N + 12 * L * H * Q * T
        flops_per_fwdbwd = flops_per_token * T
        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
        flops_achieved = flops_per_iter * (1.0 / dt)
        flops_promised = 312e12  # A100 bfloat16 peak FLOPS
        mfu = flops_achieved / flops_promised
        return mfu
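The decay/no-decay split in configure_optimizers can be sanity-checked on any small module (a sketch; the toy model below is not part of nanoGPT):

toy = nn.Sequential(nn.Linear(8, 16), nn.LayerNorm(16), nn.Linear(16, 4))
params = {pn: p for pn, p in toy.named_parameters() if p.requires_grad}
decay = [pn for pn, p in params.items() if p.dim() >= 2]     # matrices get weight decay
nodecay = [pn for pn, p in params.items() if p.dim() < 2]    # biases and LayerNorm gains do not
print(decay)    # ['0.weight', '2.weight']
print(nodecay)  # ['0.bias', '1.weight', '1.bias', '2.bias']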

Common mistakes in the GPT class:

(1) The order of the weight-tying assignment (which weight overwrites which)

(2) When initializing weights, don't forget that c_proj.weight gets a further scaled-down std (see the sketch after this list)

(3) In init_weight_, if the module is a Linear, check whether it actually has a bias

(4) In get_num_params, wte is not subtracted because wte and lm_head share weights

(5) In crop_block_size, only the attention bias buffers in transformer.h and wpe.weight are modified, because those are the only two places in the whole model that use block_size

(6) Also, when testing, data generated with torch.randint must be converted to float before it can be fed into the model
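For point (2), a quick check of the scaled-down residual-projection init (a sketch with n_layer = 12; the tensor is standalone, not taken from the model):

n_layer = 12
w = torch.empty(768, 768)
torch.nn.init.normal_(w, mean=0.0, std=0.02 / math.sqrt(2 * n_layer))
print(w.std())  # roughly 0.0041, i.e. 0.02 / sqrt(24), versus 0.02 for ordinary weights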

