From 444d1f566cf3ebad6911567d98a778fe8f5965aa Mon Sep 17 00:00:00 2001 From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Date: Thu, 2 May 2024 19:39:12 +0200 Subject: [PATCH 01/14] Enable gguf conversion with qk_norm stacking --- neural_speed/convert/convert_stablelm.py | 72 +++++++++++++++++++----- 1 file changed, 57 insertions(+), 15 deletions(-) diff --git a/neural_speed/convert/convert_stablelm.py b/neural_speed/convert/convert_stablelm.py index f5f1d43fd..25fe9ad5f 100644 --- a/neural_speed/convert/convert_stablelm.py +++ b/neural_speed/convert/convert_stablelm.py @@ -20,6 +20,7 @@ # This script is similar to "convert-pt-to-ne.py" # import os +import sys import struct import numpy as np from pathlib import Path @@ -27,6 +28,7 @@ from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) from transformers import AutoModelForCausalLM, AutoTokenizer +import torch import gguf # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py @@ -51,26 +53,50 @@ def bytes_to_unicode(): cs = [chr(n) for n in cs] return dict(zip(bs, cs)) + +def stack_qk_norm(block_count, name, n_head, norms, n_dims, ftype, layer_name="q_layernorm"): + for bid in range(block_count): + datas = [] + for xid in range(n_head): + ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" + print(f"-----> Merging Tensor {ename} with shape {norms[ename].shape} <-----") + datas.append(norms[ename]) + del norms[ename] + data = np.stack(datas, axis=0) + data_dtype = data.dtype + merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" + + if ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or merged_name.endswith("_norm.weight")): + data = data.astype(np.float32) + if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not merged_name.endswith("_norm.weight") and n_dims == 2: + data = data.astype(np.float16) + + return merged_name, data + + def stablelm_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams): print("stablelm.gguf converting: ") list_vars = model.state_dict() + n_head = hparams["num_attention_heads"] + n_kv_head = hparams["num_key_value_heads"] + block_count = hparams["num_hidden_layers"] n_rot = int(hparams["partial_rotary_factor"] * hparams["hidden_size"] / hparams["num_attention_heads"]) for name in list_vars.keys(): print(name, list_vars[name].shape, list_vars[name].dtype) print(hparams) - gguf_file = fname_out + '.gguf' + gguf_file = fname_out + '.gguf' if not fname_out.endswith(".gguf") else fname_out gguf_writer = gguf.GGUFWriter(gguf_file, "stablelm") gguf_writer.add_uint32('magic', 0x67676d66) gguf_writer.add_uint32('version', 1) gguf_writer.add_uint32('n_vocab', hparams["vocab_size"]) gguf_writer.add_embedding_length(hparams["hidden_size"]) - gguf_writer.add_head_count(hparams["num_attention_heads"]) - gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) + gguf_writer.add_head_count(n_head) + gguf_writer.add_head_count_kv(n_kv_head) - gguf_writer.add_block_count(hparams["num_hidden_layers"]) + gguf_writer.add_block_count(block_count) gguf_writer.add_rope_dimension_count(n_rot) gguf_writer.add_uint32('ftype', ftype) gguf_writer.add_context_length(hparams["max_position_embeddings"]) @@ -118,32 +144,48 @@ def write_vocab_gguf(dir_model, hparams, gguf_writer): # tensor info print("gguf: get tensor metadata") - for name in list_vars.keys(): - data = list_vars[name].squeeze().numpy() - - print("Processing variable: " + name + " with shape: ", 
data.shape) - if 'inv_freq' in name: + q_norms, k_norms = dict(), dict() + for name, data_torch in list_vars.items(): + # Convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + # Skip some tensors + if name.endswith((".attention.rotary_emb.inv_freq")): continue + old_dtype = data.dtype n_dims = len(data.shape) - + if name.find("q_layernorm.norms") != -1: + q_norms[name] = data + if len(q_norms) >= (block_count * n_head): + name, data = stack_qk_norm(block_count, name, n_head, q_norms, n_dims, ftype, layer_name="q_layernorm") + print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") + gguf_writer.add_tensor(name, data) + continue + if name.find("k_layernorm.norms") != -1: + k_norms[name] = data + if len(k_norms) >= (block_count * n_kv_head): + name, data = stack_qk_norm(block_count, name, n_kv_head, k_norms, n_dims, ftype, layer_name="k_layernorm") + print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") + gguf_writer.add_tensor(name, data) + continue + # ftype == 0 -> float32, ftype == 1 -> float16 - ftype_cur = 0 if ftype != 0: - if name[-7:] == ".weight" and n_dims == 2: + if name.endswith(".weight") and not name.endswith("_norm.weight") and n_dims == 2: print(" Converting to float16") data = data.astype(np.float16) - ftype_cur = 1 else: print(" Converting to float32") data = data.astype(np.float32) - ftype_cur = 0 else: if data.dtype != np.float32: print(" Converting to float32") data = data.astype(np.float32) - ftype_cur = 0 + print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") gguf_writer.add_tensor(name, data) print("gguf: write header") From 7d99fabc08c3a0719eac3fac7512433288270609 Mon Sep 17 00:00:00 2001 From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Date: Sat, 4 May 2024 11:45:54 +0200 Subject: [PATCH 02/14] Convert function NE format --- neural_speed/convert/convert_stablelm.py | 119 +++++++++++++++-------- 1 file changed, 76 insertions(+), 43 deletions(-) diff --git a/neural_speed/convert/convert_stablelm.py b/neural_speed/convert/convert_stablelm.py index 25fe9ad5f..80eefa7ed 100644 --- a/neural_speed/convert/convert_stablelm.py +++ b/neural_speed/convert/convert_stablelm.py @@ -55,21 +55,29 @@ def bytes_to_unicode(): def stack_qk_norm(block_count, name, n_head, norms, n_dims, ftype, layer_name="q_layernorm"): - for bid in range(block_count): + for block in range(block_count): datas = [] - for xid in range(n_head): - ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" - print(f"-----> Merging Tensor {ename} with shape {norms[ename].shape} <-----") + for i in range(n_head): + ename = f"model.layers.{block}.self_attn.{layer_name}.norms.{i}.weight" + print(f"-----> Merging Tensor {ename} with shape {norms[ename].shape}") datas.append(norms[ename]) del norms[ename] data = np.stack(datas, axis=0) data_dtype = data.dtype - merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" + merged_name = f"model.layers.{block}.self_attn.{layer_name}.weight" - if ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or merged_name.endswith("_norm.weight")): - data = data.astype(np.float32) - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not merged_name.endswith("_norm.weight") and n_dims == 2: - data = data.astype(np.float16) + # ftype == 0 -> float32, ftype == 1 -> float16 
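+        # Note: n_dims here is the rank of the original per-head norm tensor (1-D),
+        # so the stacked q/k layernorm weights always fall through to the float32 branch.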
+ if ftype != 0: + if name.endswith(".weight") and not name.endswith("_norm.weight") and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + else: + print(" Converting to float32") + data = data.astype(np.float32) + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) return merged_name, data @@ -150,11 +158,11 @@ def write_vocab_gguf(dir_model, hparams, gguf_writer): if data_torch.dtype not in (torch.float16, torch.float32): data_torch = data_torch.to(torch.float32) - data = data_torch.squeeze().numpy() # Skip some tensors if name.endswith((".attention.rotary_emb.inv_freq")): continue + data = data_torch.squeeze().numpy() old_dtype = data.dtype n_dims = len(data.shape) if name.find("q_layernorm.norms") != -1: @@ -201,15 +209,17 @@ def write_vocab_gguf(dir_model, hparams, gguf_writer): print("") def stablelm_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): - n_rot = int(hparams["partial_rotary_factor"] * hparams["hidden_size"] / hparams["num_attention_heads"]) - model.eval() - for p in model.parameters(): - p.requires_grad = False - hparams = model.config.to_dict() + print("stablelm ne converting: ") + list_vars = model.state_dict() + n_head = hparams["num_attention_heads"] + n_kv_head = hparams["num_key_value_heads"] + block_count = hparams["num_hidden_layers"] vocab_size = hparams["vocab_size"] + n_rot = int(hparams["partial_rotary_factor"] * hparams["hidden_size"] / hparams["num_attention_heads"]) print("Model loaded: ", dir_model) - fout = open(fname_out, "wb") + ne_file = fname_out + '.bin' if not fname_out.endswith(".bin") else fname_out + fout = open(ne_file, "wb") # 0x67676d6c is unversioned ne # 0x67676d66 is versioned ggmf (requires token scores) @@ -264,46 +274,69 @@ def stablelm_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): fout.write(struct.pack("i", len(text))) fout.write(text) fout.write(struct.pack("f", -10000)) - - list_vars = model.state_dict() + + def write_header(name, data): + tmp = name.encode('utf-8') + n_dims = len(data.shape) + fout.write(struct.pack("iii", n_dims, len(tmp), ftype)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + print(tmp) + fout.write(tmp) print(hparams) + q_norms, k_norms = dict(), dict() + for name, data_torch in list_vars.items(): + # Convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) - for name in list_vars.keys(): - # No gradients for these - list_vars[name].requires_grad = False - src = name - print(src, ' -> ', name) - data = list_vars[src].squeeze().numpy() - data = data.astype(np.float32) - + # Skip some tensors + if name.endswith((".attention.rotary_emb.inv_freq")): + continue + + data = data_torch.squeeze().numpy() + old_dtype = data.dtype n_dims = len(data.shape) - print(name, n_dims, data.shape) - - # default type is fp32 - ftype_cur = 0 - if ftype == 1 and n_dims > 1: - print(" Converting to float16", data.shape, data[:3, :3].tolist()) - data = data.astype(np.float16) - ftype_cur = 1 + if name.find("q_layernorm.norms") != -1: + q_norms[name] = data + if len(q_norms) >= (block_count * n_head): + name, data = stack_qk_norm(block_count, name, n_head, q_norms, n_dims, ftype, layer_name="q_layernorm") + print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") + write_header(name, data) + data.tofile(fout) + continue + if name.find("k_layernorm.norms") != 
-1: + k_norms[name] = data + if len(k_norms) >= (block_count * n_kv_head): + name, data = stack_qk_norm(block_count, name, n_kv_head, k_norms, n_dims, ftype, layer_name="k_layernorm") + print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") + write_header(name, data) + data.tofile(fout) + continue + + # ftype == 0 -> float32, ftype == 1 -> float16 + if ftype != 0: + if name.endswith(".weight") and not name.endswith("_norm.weight") and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + else: + print(" Converting to float32") + data = data.astype(np.float32) else: - print(" Converting to float32", data.shape, data[:3, :3].tolist() if n_dims > 1 else data[:3].tolist()) - data = data.astype(np.float32) + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) # header - str = name.encode('utf-8') - fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) - for i in range(n_dims): - fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) - print(str) - fout.write(str) + write_header(name, data) # data data.tofile(fout) fout.close() - print("Done. Output file: " + fname_out) + print("Done. Output file: " + ne_file) print("") def main(args_in: Optional[List[str]] = None) -> None: From c4583ec26c8b98df90e408d00e5a229bb7d968c2 Mon Sep 17 00:00:00 2001 From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Date: Mon, 6 May 2024 10:08:10 +0200 Subject: [PATCH 03/14] Add stablelm-2-12b config --- neural_speed/models/stablelm/stablelm.h | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/neural_speed/models/stablelm/stablelm.h b/neural_speed/models/stablelm/stablelm.h index 3df5b75cb..1dcd028fb 100644 --- a/neural_speed/models/stablelm/stablelm.h +++ b/neural_speed/models/stablelm/stablelm.h @@ -20,16 +20,31 @@ enum stablelm_model { STABLELM_UNKNOWN, - STABLELM_1_6B, + STABLELM_2_1_6B, + STABLELM_2_12B, STABLELM_3B, }; static const model_scratch stablelm_mem_req(int n_layers) { switch (n_layers) { - case 24: - return {512ull * MB, 512ull * MB, 1026ull * MB}; // StableLM2-1.6B & StableLM2-Zephyr-1.6B - case 32: - return {1024ull * MB, 1024ull * MB, 1026ull * MB}; // StableLM-3B + case 24: // StableLM-2-1.6B & StableLM-2-Zephyr-1.6B + return { + static_cast(scratch_size_ratio * 512) * MB, + static_cast(scratch_size_ratio * 512) * MB, + static_cast(scratch_size_ratio * 1024) * MB, + }; + case 32: // StableLM-3B & Stable-Code-3B + return { + static_cast(scratch_size_ratio * 1024) * MB, + static_cast(scratch_size_ratio * 1024) * MB, + static_cast(scratch_size_ratio * 1024) * MB, + }; + case 40: // StableLM-2-12B + return { + static_cast(scratch_size_ratio * 2560) * MB, + static_cast(scratch_size_ratio * 2560) * MB, + static_cast(scratch_size_ratio * 5120) * MB, + }; default: MODEL_ASSERT(false); } From df1d7e5e8f2afd04d3a05af04273f3359f85dbe3 Mon Sep 17 00:00:00 2001 From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Date: Mon, 6 May 2024 16:45:35 +0200 Subject: [PATCH 04/14] Fix convert stacking + tensor allocation --- neural_speed/convert/convert_stablelm.py | 86 ++++++++++--------- .../models/stablelm/stablelm_utils.cpp | 24 ++++-- 2 files changed, 62 insertions(+), 48 deletions(-) diff --git a/neural_speed/convert/convert_stablelm.py b/neural_speed/convert/convert_stablelm.py index 80eefa7ed..1b3b97a7c 100644 --- a/neural_speed/convert/convert_stablelm.py +++ b/neural_speed/convert/convert_stablelm.py @@ -54,32 +54,30 @@ def 
bytes_to_unicode(): return dict(zip(bs, cs)) -def stack_qk_norm(block_count, name, n_head, norms, n_dims, ftype, layer_name="q_layernorm"): - for block in range(block_count): - datas = [] - for i in range(n_head): - ename = f"model.layers.{block}.self_attn.{layer_name}.norms.{i}.weight" - print(f"-----> Merging Tensor {ename} with shape {norms[ename].shape}") - datas.append(norms[ename]) - del norms[ename] - data = np.stack(datas, axis=0) - data_dtype = data.dtype - merged_name = f"model.layers.{block}.self_attn.{layer_name}.weight" - - # ftype == 0 -> float32, ftype == 1 -> float16 - if ftype != 0: - if name.endswith(".weight") and not name.endswith("_norm.weight") and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - else: - print(" Converting to float32") - data = data.astype(np.float32) +def stack_qk_norm(block, name, n_head, norms, n_dims, ftype, layer_name="q_layernorm"): + datas = [] + for i in range(n_head): + ename = f"model.layers.{block}.self_attn.{layer_name}.norms.{i}.weight" + print(f"-----> Merging Tensor {ename} with shape {norms[ename].shape}") + datas.append(norms[ename]) + del norms[ename] + data = np.stack(datas, axis=0) + merged_name = f"model.layers.{block}.self_attn.{layer_name}.weight" + + # ftype == 0 -> float32, ftype == 1 -> float16 + if ftype != 0: + if name.endswith(".weight") and not name.endswith("_norm.weight") and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) else: - if data.dtype != np.float32: - print(" Converting to float32") - data = data.astype(np.float32) + print(" Converting to float32") + data = data.astype(np.float32) + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) - return merged_name, data + return merged_name, data def stablelm_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams): @@ -168,16 +166,18 @@ def write_vocab_gguf(dir_model, hparams, gguf_writer): if name.find("q_layernorm.norms") != -1: q_norms[name] = data if len(q_norms) >= (block_count * n_head): - name, data = stack_qk_norm(block_count, name, n_head, q_norms, n_dims, ftype, layer_name="q_layernorm") - print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") - gguf_writer.add_tensor(name, data) + for block in range(block_count): + name, data = stack_qk_norm(block, name, n_head, q_norms, n_dims, ftype, layer_name="q_layernorm") + print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") + gguf_writer.add_tensor(name, data) continue if name.find("k_layernorm.norms") != -1: k_norms[name] = data if len(k_norms) >= (block_count * n_kv_head): - name, data = stack_qk_norm(block_count, name, n_kv_head, k_norms, n_dims, ftype, layer_name="k_layernorm") - print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") - gguf_writer.add_tensor(name, data) + for block in range(block_count): + name, data = stack_qk_norm(block, name, n_kv_head, k_norms, n_dims, ftype, layer_name="k_layernorm") + print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") + gguf_writer.add_tensor(name, data) continue # ftype == 0 -> float32, ftype == 1 -> float16 @@ -228,11 +228,11 @@ def stablelm_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): fout.write(struct.pack("i", ne_file_magic)) # magic: ne in hex fout.write(struct.pack("i", 1)) - fout.write(struct.pack("i", hparams["vocab_size"])) + fout.write(struct.pack("i", vocab_size)) 
fout.write(struct.pack("i", hparams["hidden_size"])) fout.write(struct.pack("i", 0)) - fout.write(struct.pack("i", hparams["num_attention_heads"])) - fout.write(struct.pack("i", hparams["num_key_value_heads"])) # multi-query attention + fout.write(struct.pack("i", n_head)) + fout.write(struct.pack("i", n_kv_head)) # multi-query attention fout.write(struct.pack("i", hparams["num_hidden_layers"])) fout.write(struct.pack("i", n_rot)) fout.write(struct.pack("i", ftype)) @@ -249,7 +249,7 @@ def stablelm_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): fout.write(struct.pack("i", 0)) # n_experts fout.write(struct.pack("i", 0)) # n_expert_used - fout.write(struct.pack("i", 0)) # n_embd_head_k for gemma + fout.write(struct.pack("i", hparams["hidden_size"] // n_head)) # n_embd_head_k for gemma fout.write(struct.pack("f", hparams.get("layer_norm_eps", 1e-5))) # rms_norm_eps or layer_norm_eps fout.write(struct.pack("f", hparams["rope_theta"])) # freq_base fout.write(struct.pack("f", 1.0)) # freq_scale, was removed in config.json (by default=1.0) @@ -301,18 +301,20 @@ def write_header(name, data): if name.find("q_layernorm.norms") != -1: q_norms[name] = data if len(q_norms) >= (block_count * n_head): - name, data = stack_qk_norm(block_count, name, n_head, q_norms, n_dims, ftype, layer_name="q_layernorm") - print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") - write_header(name, data) - data.tofile(fout) + for block in range(block_count): + name, data = stack_qk_norm(block, name, n_head, q_norms, n_dims, ftype, layer_name="q_layernorm") + print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") + write_header(name, data) + data.tofile(fout) continue if name.find("k_layernorm.norms") != -1: k_norms[name] = data if len(k_norms) >= (block_count * n_kv_head): - name, data = stack_qk_norm(block_count, name, n_kv_head, k_norms, n_dims, ftype, layer_name="k_layernorm") - print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") - write_header(name, data) - data.tofile(fout) + for block in range(block_count): + name, data = stack_qk_norm(block, name, n_kv_head, k_norms, n_dims, ftype, layer_name="k_layernorm") + print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") + write_header(name, data) + data.tofile(fout) continue # ftype == 0 -> float32, ftype == 1 -> float16 diff --git a/neural_speed/models/stablelm/stablelm_utils.cpp b/neural_speed/models/stablelm/stablelm_utils.cpp index e12877995..76cf04c34 100644 --- a/neural_speed/models/stablelm/stablelm_utils.cpp +++ b/neural_speed/models/stablelm/stablelm_utils.cpp @@ -73,6 +73,9 @@ void stablelm::init(const char* path_model, model_context* ctx, int n_gpu_layer_ n_embd = hparams.n_embd; n_vocab = hparams.n_vocab; n_layer = hparams.n_layer; + n_head = hparams.n_head; + n_head_kv = hparams.n_head_kv; + n_embd_head_k = hparams.n_embd_head_k; n_embd = hparams.n_embd; scratch = stablelm_mem_req(n_layer); model.scratchs = scratch; @@ -130,7 +133,7 @@ void stablelm::load(model_context* ctx, model_progress_callback progress_callbac layer.norm[1] = ml->get_tensor(layers_i + ".input_layernorm.bias", {n_embd}, backend); // qkv GEMM + out proj GEMM - if (ml->verify_tensor(layers_i + ".self_attn.q_proj.bias")) { // Stablelm2 1.6B & Stablelm2 Zephyr 1.6B + if (n_layer == 24) { // StableLM-2-1.6B & StableLM-2-Zephyr-1.6B layer.attn[0] = ml->get_tensor(layers_i + ".self_attn.q_proj.weight", {n_embd, n_embd}, backend); 
layer.attn[1] = ml->get_tensor(layers_i + ".self_attn.q_proj.bias", {n_embd}, backend); layer.attn[2] = ml->get_tensor(layers_i + ".self_attn.k_proj.weight", {n_embd, n_embd}, backend); @@ -138,17 +141,26 @@ void stablelm::load(model_context* ctx, model_progress_callback progress_callbac layer.attn[4] = ml->get_tensor(layers_i + ".self_attn.v_proj.weight", {n_embd, n_embd}, backend); layer.attn[5] = ml->get_tensor(layers_i + ".self_attn.v_proj.bias", {n_embd}, backend); layer.attn[6] = ml->get_tensor(layers_i + ".self_attn.o_proj.weight", {n_embd, n_embd}, backend); - } else { // Stablelm 3B + } else if (n_layer == 32) { // StableLM-3B & Stable-Code-3B layer.attn[0] = ml->get_tensor(layers_i + ".self_attn.q_proj.weight", {n_embd, n_embd}, backend); layer.attn[1] = ml->get_tensor(layers_i + ".self_attn.k_proj.weight", {n_embd, n_embd}, backend); layer.attn[2] = ml->get_tensor(layers_i + ".self_attn.v_proj.weight", {n_embd, n_embd}, backend); layer.attn[3] = ml->get_tensor(layers_i + ".self_attn.o_proj.weight", {n_embd, n_embd}, backend); + } else if (n_layer == 40) { // StableLM-2-12B + layer.attn[0] = ml->get_tensor(layers_i + ".self_attn.q_proj.weight", {n_embd, n_embd}, backend); + layer.attn[1] = ml->get_tensor(layers_i + ".self_attn.k_proj.weight", {n_embd, n_embd}, backend); + layer.attn[2] = ml->get_tensor(layers_i + ".self_attn.v_proj.weight", {n_embd, n_embd}, backend); + layer.attn[3] = ml->get_tensor(layers_i + ".self_attn.o_proj.weight", {n_embd, n_embd}, backend); + layer.attn[4] = ml->get_tensor(layers_i + ".self_attn.q_layernorm.weight", {n_embd_head_k, n_head}, backend); + layer.attn[5] = ml->get_tensor(layers_i + ".self_attn.k_layernorm.weight", {n_embd_head_k, n_head_kv}, backend); + } + + // Post Attention norm - Only present in 1.6B & 3B + if (n_layer < 40) { + layer.norm[2] = ml->get_tensor(layers_i + ".post_attention_layernorm.weight", {n_embd}, backend); + layer.norm[3] = ml->get_tensor(layers_i + ".post_attention_layernorm.bias", {n_embd}, backend); } - // Post Attention norm - layer.norm[2] = ml->get_tensor(layers_i + ".post_attention_layernorm.weight", {n_embd}, backend); - layer.norm[3] = ml->get_tensor(layers_i + ".post_attention_layernorm.bias", {n_embd}, backend); - // ffn GEMM layer.ffn[0] = ml->get_tensor(layers_i + ".mlp.gate_proj.weight", {n_embd, n_ff}, backend); layer.ffn[1] = ml->get_tensor(layers_i + ".mlp.down_proj.weight", {n_ff, n_embd}, backend); From 442aa8935315949cecbce2a50fc20674ffc1317f Mon Sep 17 00:00:00 2001 From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Date: Mon, 6 May 2024 18:54:30 +0200 Subject: [PATCH 05/14] Minor fixes --- neural_speed/convert/convert_stablelm.py | 16 ++++++++-------- neural_speed/models/stablelm/stablelm.cpp | 5 +++-- neural_speed/models/stablelm/stablelm_utils.cpp | 9 +++++++-- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/neural_speed/convert/convert_stablelm.py b/neural_speed/convert/convert_stablelm.py index 1b3b97a7c..8bf8c3893 100644 --- a/neural_speed/convert/convert_stablelm.py +++ b/neural_speed/convert/convert_stablelm.py @@ -84,7 +84,7 @@ def stablelm_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams print("stablelm.gguf converting: ") list_vars = model.state_dict() n_head = hparams["num_attention_heads"] - n_kv_head = hparams["num_key_value_heads"] + n_head_kv = hparams["num_key_value_heads"] block_count = hparams["num_hidden_layers"] n_rot = int(hparams["partial_rotary_factor"] * hparams["hidden_size"] / hparams["num_attention_heads"]) for name in 
list_vars.keys(): @@ -100,7 +100,7 @@ def stablelm_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams gguf_writer.add_uint32('n_vocab', hparams["vocab_size"]) gguf_writer.add_embedding_length(hparams["hidden_size"]) gguf_writer.add_head_count(n_head) - gguf_writer.add_head_count_kv(n_kv_head) + gguf_writer.add_head_count_kv(n_head_kv) gguf_writer.add_block_count(block_count) gguf_writer.add_rope_dimension_count(n_rot) @@ -173,9 +173,9 @@ def write_vocab_gguf(dir_model, hparams, gguf_writer): continue if name.find("k_layernorm.norms") != -1: k_norms[name] = data - if len(k_norms) >= (block_count * n_kv_head): + if len(k_norms) >= (block_count * n_head_kv): for block in range(block_count): - name, data = stack_qk_norm(block, name, n_kv_head, k_norms, n_dims, ftype, layer_name="k_layernorm") + name, data = stack_qk_norm(block, name, n_head_kv, k_norms, n_dims, ftype, layer_name="k_layernorm") print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") gguf_writer.add_tensor(name, data) continue @@ -212,7 +212,7 @@ def stablelm_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): print("stablelm ne converting: ") list_vars = model.state_dict() n_head = hparams["num_attention_heads"] - n_kv_head = hparams["num_key_value_heads"] + n_head_kv = hparams["num_key_value_heads"] block_count = hparams["num_hidden_layers"] vocab_size = hparams["vocab_size"] n_rot = int(hparams["partial_rotary_factor"] * hparams["hidden_size"] / hparams["num_attention_heads"]) @@ -232,7 +232,7 @@ def stablelm_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): fout.write(struct.pack("i", hparams["hidden_size"])) fout.write(struct.pack("i", 0)) fout.write(struct.pack("i", n_head)) - fout.write(struct.pack("i", n_kv_head)) # multi-query attention + fout.write(struct.pack("i", n_head_kv)) # multi-query attention fout.write(struct.pack("i", hparams["num_hidden_layers"])) fout.write(struct.pack("i", n_rot)) fout.write(struct.pack("i", ftype)) @@ -309,9 +309,9 @@ def write_header(name, data): continue if name.find("k_layernorm.norms") != -1: k_norms[name] = data - if len(k_norms) >= (block_count * n_kv_head): + if len(k_norms) >= (block_count * n_head_kv): for block in range(block_count): - name, data = stack_qk_norm(block, name, n_kv_head, k_norms, n_dims, ftype, layer_name="k_layernorm") + name, data = stack_qk_norm(block, name, n_head_kv, k_norms, n_dims, ftype, layer_name="k_layernorm") print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") write_header(name, data) data.tofile(fout) diff --git a/neural_speed/models/stablelm/stablelm.cpp b/neural_speed/models/stablelm/stablelm.cpp index 4b0dc9935..48522cdc3 100644 --- a/neural_speed/models/stablelm/stablelm.cpp +++ b/neural_speed/models/stablelm/stablelm.cpp @@ -74,6 +74,7 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* const int n_ctx = lctx.n_ctx; const int n_keep = lctx.n_keep; const int n_head = hparams.n_head; + const int n_head_kv = hparams.n_head_kv; const int n_vocab = hparams.n_vocab; const int n_rot = hparams.n_rot; const int head_dim = n_embd / n_head; @@ -143,7 +144,7 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* struct ne_tensor* Qcur; struct ne_tensor* Kcur; struct ne_tensor* Vcur; - if (n_layer == 24) { // Stablelm2 1.6B & Stablelm2 Zephyr 1.6B + if (n_layer == 24) { // StableLM-2-1.6B & StableLM-2-Zephyr-1.6B Qcur = ne_reshape_4d(ctx0, ne_add(ctx0, ne_mul_mat(ctx0, 
model.layers[il].attn[0], cur), model.layers[il].attn[1]), head_dim, n_head, N, 1); @@ -155,7 +156,7 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* head_dim, n_head, N, 1); } else { // Stablelm 3B Qcur = ne_reshape_4d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[0], cur), head_dim, n_head, N, 1); - Kcur = ne_reshape_4d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[1], cur), head_dim, n_head, N, 1); + Kcur = ne_reshape_4d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[1], cur), head_dim, n_head_kv, N, 1); Vcur = ne_reshape_4d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[2], cur), head_dim, n_head, N, 1); } diff --git a/neural_speed/models/stablelm/stablelm_utils.cpp b/neural_speed/models/stablelm/stablelm_utils.cpp index 76cf04c34..c8cda7342 100644 --- a/neural_speed/models/stablelm/stablelm_utils.cpp +++ b/neural_speed/models/stablelm/stablelm_utils.cpp @@ -167,17 +167,22 @@ void stablelm::load(model_context* ctx, model_progress_callback progress_callbac layer.ffn[2] = ml->get_tensor(layers_i + ".mlp.up_proj.weight", {n_embd, n_ff}, backend); if (backend != NE_BACKEND_CPU) { - if (ml->verify_tensor(layers_i + ".self_attn.q_proj.bias")) { + if (n_layer == 24) { vram_total += ne_nbytes(layer.norm[0]) + ne_nbytes(layer.norm[1]) + ne_nbytes(layer.norm[2]) + ne_nbytes(layer.norm[3]) + ne_nbytes(layer.attn[0]) + ne_nbytes(layer.attn[1]) + ne_nbytes(layer.attn[2]) + ne_nbytes(layer.attn[3]) + ne_nbytes(layer.attn[4]) + ne_nbytes(layer.attn[5]) + ne_nbytes(layer.attn[6]) + ne_nbytes(layer.ffn[0]) + ne_nbytes(layer.ffn[1]) + ne_nbytes(layer.ffn[2]); - } else { + } else if (n_layer == 32) { vram_total += ne_nbytes(layer.norm[0]) + ne_nbytes(layer.norm[1]) + ne_nbytes(layer.norm[2]) + ne_nbytes(layer.norm[3]) + ne_nbytes(layer.attn[0]) + ne_nbytes(layer.attn[1]) + ne_nbytes(layer.attn[2]) + ne_nbytes(layer.attn[3]) + ne_nbytes(layer.ffn[0]) + ne_nbytes(layer.ffn[1]) + ne_nbytes(layer.ffn[2]); + } else if (n_layer == 40) { + vram_total += ne_nbytes(layer.norm[0]) + ne_nbytes(layer.norm[1]) + ne_nbytes(layer.attn[0]) + + ne_nbytes(layer.attn[1]) + ne_nbytes(layer.attn[2]) + ne_nbytes(layer.attn[3]) + + ne_nbytes(layer.attn[4]) + ne_nbytes(layer.attn[5]) + ne_nbytes(layer.ffn[0]) + + ne_nbytes(layer.ffn[1]) + ne_nbytes(layer.ffn[2]); } } } From c5c35b8088ec29344facb506b0483b4835a24f51 Mon Sep 17 00:00:00 2001 From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Date: Tue, 7 May 2024 16:21:07 +0200 Subject: [PATCH 06/14] Add inference for stablelm-2-12b --- neural_speed/models/stablelm/stablelm.cpp | 77 ++++++++++++++--------- neural_speed/models/stablelm/stablelm.h | 4 +- 2 files changed, 48 insertions(+), 33 deletions(-) diff --git a/neural_speed/models/stablelm/stablelm.cpp b/neural_speed/models/stablelm/stablelm.cpp index 48522cdc3..3c3497811 100644 --- a/neural_speed/models/stablelm/stablelm.cpp +++ b/neural_speed/models/stablelm/stablelm.cpp @@ -102,7 +102,7 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* attn_shape_t attn_shape = { /* .batch_size = */ 1, /* .head_num = */ n_head, - /* .heads_kv = */ n_head, + /* .heads_kv = */ n_head_kv, /* .head_size = */ head_dim, /* .sl_q = */ N, // Note: make sure that bestla reordered attn supports next token inference /* .sl_kv = */ n_past + N, @@ -111,7 +111,7 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* NE_ASSERT(("bestla managed kv-cache not supported; use `--memory-f16 / --memory-f32` instead", 
bestla_reordered_attn_fp32_support(&attn_shape))); kv_shape_t kv_shape{ - /* .heads_kv = */ static_cast(n_head), + /* .heads_kv = */ static_cast(n_head_kv), /* .head_size = */ static_cast(head_dim), /* .sl_kv_max = */ static_cast(n_ctx), }; @@ -124,6 +124,7 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* } struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); + struct ne_tensor* inpPA; for (int il = 0; il < n_layer; ++il) { struct ne_tensor* cur; @@ -134,12 +135,14 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* // layer_norm { cur = ne_norm(ctx0, inpL, hparams.norm_eps); - // cur = cur*attention_norm(broadcasted) cur = ne_mul(ctx0, cur, model.layers[il].norm[0]); cur = ne_add(ctx0, cur, model.layers[il].norm[1]); } + // Store for parallel MLP layer + inpPA = cur; + // Compute QKV struct ne_tensor* Qcur; struct ne_tensor* Kcur; @@ -150,14 +153,22 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* head_dim, n_head, N, 1); Kcur = ne_reshape_4d(ctx0, ne_add(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[2], cur), model.layers[il].attn[3]), - head_dim, n_head, N, 1); + head_dim, n_head_kv, N, 1); Vcur = ne_reshape_4d(ctx0, ne_add(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[4], cur), model.layers[il].attn[5]), - head_dim, n_head, N, 1); - } else { // Stablelm 3B + head_dim, n_head_kv, N, 1); + } else if (n_layer == 32) { // StableLM-3B & Stable-Code-3B Qcur = ne_reshape_4d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[0], cur), head_dim, n_head, N, 1); Kcur = ne_reshape_4d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[1], cur), head_dim, n_head_kv, N, 1); - Vcur = ne_reshape_4d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[2], cur), head_dim, n_head, N, 1); + Vcur = ne_reshape_4d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[2], cur), head_dim, n_head_kv, N, 1); + } else if (n_layer == 40) { // StableLM-2-12B + Qcur = ne_reshape_4d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[0], cur), head_dim, n_head, N, 1); + Qcur = ne_norm(ctx0, Qcur, hparams.norm_eps); + Qcur = ne_mul(ctx0, Qcur, model.layers[il].attn[4]); + Kcur = ne_reshape_4d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[1], cur), head_dim, n_head_kv, N, 1); + Kcur = ne_norm(ctx0, Kcur, hparams.norm_eps); + Kcur = ne_mul(ctx0, Kcur, model.layers[il].attn[5]); + Vcur = ne_reshape_4d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[2], cur), head_dim, n_head_kv, N, 1); } // using mode = 2 for GPT-NeoX mode @@ -167,7 +178,7 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* ne_build_forward_expand(&gf, Qcur_Part); ne_set_name(Qcur, "Qcur"); - struct ne_tensor* Kcur_Part = ne_view_4d(ctx0, ne_permute(ctx0, Kcur, 0, 2, 1, 3), n_rot, n_head, N, 1, + struct ne_tensor* Kcur_Part = ne_view_4d(ctx0, ne_permute(ctx0, Kcur, 0, 2, 1, 3), n_rot, n_head_kv, N, 1, Kcur->nb[1], Kcur->nb[2], Kcur->nb[3], 0); Kcur_Part = ne_rope_inplace(ctx0, Kcur_Part, n_past, n_rot, 2, 0, hparams.freq_base, hparams.freq_scale); ne_build_forward_expand(&gf, Kcur_Part); @@ -185,14 +196,14 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* for (int i = 0; i < batch_size; ++i) { // batch K Kcur_bs[i] = ne_permute(ctx0, - ne_view_4d(ctx0, Kcur, head_dim, n_head, N, 1, ne_element_size(Kcur) * head_dim, + ne_view_4d(ctx0, Kcur, head_dim, n_head_kv, N, 1, ne_element_size(Kcur) * head_dim, ne_element_size(Kcur) * n_embd, ne_element_size(Kcur) * n_embd * N, i * ne_element_size(Kcur) * n_embd * N), 0, 2, 1, 3); 
Kcur_temp = Kcur_bs[i]; ne_set_name(Kcur_bs[i], "kcur_bs"); k_bs[i] = ne_view_4d( - ctx0, kv_self.k, head_dim, N, n_head, 1, ne_element_size(kv_self.k) * head_dim, + ctx0, kv_self.k, head_dim, N, n_head_kv, 1, ne_element_size(kv_self.k) * head_dim, ne_element_size(kv_self.k) * head_dim * n_ctx, ne_element_size(kv_self.k) * n_embd * n_ctx, ((il * n_ctx) * ne_element_size(kv_self.k) * n_embd * kv_n_ctx_block + i * n_ctx * n_embd * ne_element_size(kv_self.k) + head_dim * n_past * ne_element_size(kv_self.k))); @@ -202,10 +213,10 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* ne_reshape_4d(ctx0, ne_view_2d(ctx0, Vcur, n_embd, N, ne_element_size(Vcur) * n_embd, i * ne_element_size(Vcur) * n_embd * N), - head_dim, n_head, N, 1), + head_dim, n_head_kv, N, 1), 1, 2, 0, 3); v_bs[i] = - ne_view_4d(ctx0, kv_self.v, N, head_dim, n_head, 1, n_ctx * ne_element_size(kv_self.v), + ne_view_4d(ctx0, kv_self.v, N, head_dim, n_head_kv, 1, n_ctx * ne_element_size(kv_self.v), n_ctx * ne_element_size(kv_self.v) * head_dim, n_ctx * ne_element_size(kv_self.v) * n_embd, ((il * n_ctx) * ne_element_size(kv_self.v) * n_embd * kv_n_ctx_block + i * n_ctx * n_embd * ne_element_size(kv_self.v) + n_past * ne_element_size(kv_self.v))); @@ -218,7 +229,7 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* ne_set_name(Q, "Q"); // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) struct ne_tensor* K = - ne_view_4d(ctx0, kv_self.k, head_dim, n_past + N, n_head, batch_size, ne_element_size(kv_self.k) * head_dim, + ne_view_4d(ctx0, kv_self.k, head_dim, n_past + N, n_head_kv, batch_size, ne_element_size(kv_self.k) * head_dim, ne_element_size(kv_self.k) * head_dim * n_ctx, ne_element_size(kv_self.k) * n_embd * n_ctx, il * n_ctx * ne_element_size(kv_self.k) * n_embd * kv_n_ctx_block); ne_set_name(K, "K"); @@ -227,7 +238,7 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* // KQ_scaled = KQ / sqrt(n_embd/n_head) struct ne_tensor* KQ_scaled = - ne_scale_inplace(ctx0, KQ, ne_new_f32(ctx0, 1.0f / sqrt(static_cast((n_embd) / n_head)))); + ne_scale_inplace(ctx0, KQ, ne_new_f32(ctx0, 1.0f / sqrt(static_cast(head_dim)))); // KQ_masked = mask_past(KQ_scaled) struct ne_tensor* KQ_masked = ne_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); @@ -237,7 +248,7 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() struct ne_tensor* V = - ne_view_4d(ctx0, kv_self.v, n_past + N, head_dim, n_head, batch_size, n_ctx * ne_element_size(kv_self.v), + ne_view_4d(ctx0, kv_self.v, n_past + N, head_dim, n_head_kv, batch_size, n_ctx * ne_element_size(kv_self.v), n_ctx * ne_element_size(kv_self.v) * head_dim, n_ctx * ne_element_size(kv_self.v) * n_embd, il * n_ctx * ne_element_size(kv_self.v) * n_embd * kv_n_ctx_block); @@ -256,15 +267,15 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* // store key and value to memory { - const auto k_cache = ne_view_3d(ctx0, kv_self.k, // tensor - head_dim, n_ctx, n_head, // ne - 0, 0, // nb (bestla managed) - il * k_size); // offset + const auto k_cache = ne_view_3d(ctx0, kv_self.k, // tensor + head_dim, n_ctx, n_head_kv, // ne + 0, 0, // nb (bestla managed) + il * k_size); // offset ne_build_forward_expand(&gf, ne_flash_attn_update_k(ctx0, k_cache, Kcur, n_past, false)); - const auto v_cache = ne_view_3d(ctx0, kv_self.v, // tensor - head_dim, n_ctx, n_head, 
// ne - 0, 0, // nb (bestla managed) - il * v_size); // offset + const auto v_cache = ne_view_3d(ctx0, kv_self.v, // tensor + head_dim, n_ctx, n_head_kv, // ne + 0, 0, // nb (bestla managed) + il * v_size); // offset ne_build_forward_expand(&gf, ne_flash_attn_update_v(ctx0, v_cache, Vcur, n_past, false)); } @@ -273,14 +284,14 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* struct ne_tensor* K = ne_view_3d(ctx0, kv_self.k, // tensor - head_dim, seq_kv, n_head, // ne + head_dim, seq_kv, n_head_kv, // ne kv_cache_info.stride_k_sl, kv_cache_info.stride_k_head_num, // nb (bestla managed) il * k_size); // offset *reinterpret_cast(&K->nb[0]) = kv_cache_info.k_layout; // us nb0 for layout ne_set_name(K, "K"); struct ne_tensor* V = ne_view_3d(ctx0, kv_self.v, // tensor - seq_kv, head_dim, n_head, // ne + seq_kv, head_dim, n_head_kv, // ne kv_cache_info.stride_v_head_size, kv_cache_info.stride_v_head_num, // nb (bestla managed) il * v_size); // offset *reinterpret_cast(&V->nb[0]) = kv_cache_info.v_layout; // us nb0 for layout @@ -294,15 +305,15 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* // projection { - if (n_layer == 24) { // Stablelm2 1.6B & Stablelm2 Zephyr 1.6B + if (n_layer == 24) { // StableLM-2-1.6B & StableLM-2-Zephyr-1.6B cur = ne_mul_mat(ctx0, model.layers[il].attn[6], cur); - } else { // Stablelm 3B + } else { // StableLM-3B & Stable-Code-3B & StableLM-2-12B cur = ne_mul_mat(ctx0, model.layers[il].attn[3], cur); } } } lctx.use_buf(ctx0, 1); - + cur = ne_add(ctx0, cur, inpL); inpL = cur; @@ -310,9 +321,13 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* { // Post Attention norm { - cur = ne_norm(ctx0, cur, hparams.norm_eps); - cur = ne_mul(ctx0, cur, model.layers[il].norm[2]); - cur = ne_add(ctx0, cur, model.layers[il].norm[3]); + if (n_layer < 40) { + cur = ne_norm(ctx0, cur, hparams.norm_eps); + cur = ne_mul(ctx0, cur, model.layers[il].norm[2]); + cur = ne_add(ctx0, cur, model.layers[il].norm[3]); + } else { + cur = inpPA; // Parallel FFN + } } if (bestla_fusion_FFN_SiLu_f32f32_support(model.layers[il].ffn[0]->data, model.layers[il].ffn[1]->data, diff --git a/neural_speed/models/stablelm/stablelm.h b/neural_speed/models/stablelm/stablelm.h index 1dcd028fb..9cfe4f102 100644 --- a/neural_speed/models/stablelm/stablelm.h +++ b/neural_speed/models/stablelm/stablelm.h @@ -25,7 +25,7 @@ enum stablelm_model { STABLELM_3B, }; -static const model_scratch stablelm_mem_req(int n_layers) { +static const model_scratch stablelm_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { switch (n_layers) { case 24: // StableLM-2-1.6B & StableLM-2-Zephyr-1.6B return { @@ -54,7 +54,7 @@ class stablelm : public IModel { private: model_archs name = MODEL_STABLELM; std::unique_ptr ml; - uint32_t n_layer, n_embd, n_ff, n_vocab; + uint32_t n_layer, n_embd, n_ff, n_vocab, n_head, n_head_kv, n_embd_head_k; int n_ctx, n_gpu_layer; bool use_mmap, use_mlock, vocab_only; model_scratch scratch; From a9be06c6e2ec170a2fcbf0d1bcfff8577f8d747d Mon Sep 17 00:00:00 2001 From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Date: Tue, 7 May 2024 19:06:08 +0200 Subject: [PATCH 07/14] Fix conversion issues --- neural_speed/convert/convert_stablelm.py | 14 ++++++++------ neural_speed/models/stablelm/stablelm_utils.cpp | 8 ++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/neural_speed/convert/convert_stablelm.py b/neural_speed/convert/convert_stablelm.py index 8bf8c3893..727be416e 100644 
--- a/neural_speed/convert/convert_stablelm.py +++ b/neural_speed/convert/convert_stablelm.py @@ -275,14 +275,14 @@ def stablelm_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): fout.write(text) fout.write(struct.pack("f", -10000)) - def write_header(name, data): - tmp = name.encode('utf-8') + def write_header(name, data, ftype=0): + str = name.encode('utf-8') n_dims = len(data.shape) - fout.write(struct.pack("iii", n_dims, len(tmp), ftype)) + fout.write(struct.pack("iii", n_dims, len(str), ftype)) for i in range(n_dims): fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) - print(tmp) - fout.write(tmp) + print(str) + fout.write(str) print(hparams) q_norms, k_norms = dict(), dict() @@ -318,10 +318,12 @@ def write_header(name, data): continue # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 if ftype != 0: if name.endswith(".weight") and not name.endswith("_norm.weight") and n_dims == 2: print(" Converting to float16") data = data.astype(np.float16) + ftype_cur = 1 else: print(" Converting to float32") data = data.astype(np.float32) @@ -331,7 +333,7 @@ def write_header(name, data): data = data.astype(np.float32) # header - write_header(name, data) + write_header(name, data, ftype_cur) # data data.tofile(fout) diff --git a/neural_speed/models/stablelm/stablelm_utils.cpp b/neural_speed/models/stablelm/stablelm_utils.cpp index c8cda7342..9bd2ac2a2 100644 --- a/neural_speed/models/stablelm/stablelm_utils.cpp +++ b/neural_speed/models/stablelm/stablelm_utils.cpp @@ -147,10 +147,10 @@ void stablelm::load(model_context* ctx, model_progress_callback progress_callbac layer.attn[2] = ml->get_tensor(layers_i + ".self_attn.v_proj.weight", {n_embd, n_embd}, backend); layer.attn[3] = ml->get_tensor(layers_i + ".self_attn.o_proj.weight", {n_embd, n_embd}, backend); } else if (n_layer == 40) { // StableLM-2-12B - layer.attn[0] = ml->get_tensor(layers_i + ".self_attn.q_proj.weight", {n_embd, n_embd}, backend); - layer.attn[1] = ml->get_tensor(layers_i + ".self_attn.k_proj.weight", {n_embd, n_embd}, backend); - layer.attn[2] = ml->get_tensor(layers_i + ".self_attn.v_proj.weight", {n_embd, n_embd}, backend); - layer.attn[3] = ml->get_tensor(layers_i + ".self_attn.o_proj.weight", {n_embd, n_embd}, backend); + layer.attn[0] = ml->get_tensor(layers_i + ".self_attn.q_proj.weight", {n_embd, n_embd_head_k * n_head}, backend); + layer.attn[1] = ml->get_tensor(layers_i + ".self_attn.k_proj.weight", {n_embd, n_embd_head_k * n_head_kv}, backend); + layer.attn[2] = ml->get_tensor(layers_i + ".self_attn.v_proj.weight", {n_embd, n_embd_head_k * n_head_kv}, backend); + layer.attn[3] = ml->get_tensor(layers_i + ".self_attn.o_proj.weight", {n_embd_head_k * n_head, n_embd}, backend); layer.attn[4] = ml->get_tensor(layers_i + ".self_attn.q_layernorm.weight", {n_embd_head_k, n_head}, backend); layer.attn[5] = ml->get_tensor(layers_i + ".self_attn.k_layernorm.weight", {n_embd_head_k, n_head_kv}, backend); } From c6e2fcaffa36671529ca5d9b3ce18e23339c9df5 Mon Sep 17 00:00:00 2001 From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Date: Thu, 9 May 2024 18:03:08 +0200 Subject: [PATCH 08/14] Fix gibberish text issue --- neural_speed/models/stablelm/stablelm.cpp | 2 +- neural_speed/models/stablelm/stablelm_utils.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_speed/models/stablelm/stablelm.cpp b/neural_speed/models/stablelm/stablelm.cpp index 3c3497811..19f36c480 100644 --- a/neural_speed/models/stablelm/stablelm.cpp +++ 
b/neural_speed/models/stablelm/stablelm.cpp @@ -124,10 +124,10 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* } struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd); - struct ne_tensor* inpPA; for (int il = 0; il < n_layer; ++il) { struct ne_tensor* cur; + struct ne_tensor* inpPA; lctx.use_buf(ctx0, 0); diff --git a/neural_speed/models/stablelm/stablelm_utils.cpp b/neural_speed/models/stablelm/stablelm_utils.cpp index 9bd2ac2a2..76e4c9e8b 100644 --- a/neural_speed/models/stablelm/stablelm_utils.cpp +++ b/neural_speed/models/stablelm/stablelm_utils.cpp @@ -213,7 +213,7 @@ void stablelm::load(model_context* ctx, model_progress_callback progress_callbac class stablelm_quant_layer : public quant_layer_base { public: quant_params_internal get_layer_config(std::string layername, std::vector ne, ne_type type) override { - bool quantize = layername.rfind("weight") == layername.size() - 6; // ends with 'weight'? + bool quantize = (layername.rfind("weight") == layername.size() - 6) && (layername.find("layernorm") == std::string::npos); // ends with 'weight'? if (layername == "model.embed_tokens.weight") { // special layer process, can be loaded by config file return quant_params_internal(); // return q4_0 to cover the usage of getrow From 991d55b97de6b5855f90831eb8f616762ab7ad27 Mon Sep 17 00:00:00 2001 From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Date: Fri, 10 May 2024 19:10:47 +0200 Subject: [PATCH 09/14] Remove redundant tensor allocation + Add supported models --- docs/supported_models.md | 22 ++- neural_speed/convert/convert_stablelm.py | 142 +----------------- neural_speed/models/stablelm/stablelm.cpp | 14 +- .../models/stablelm/stablelm_utils.cpp | 24 +-- 4 files changed, 32 insertions(+), 170 deletions(-) diff --git a/docs/supported_models.md b/docs/supported_models.md index ebc3b5d30..2222af282 100644 --- a/docs/supported_models.md +++ b/docs/supported_models.md @@ -290,8 +290,10 @@ Neural Speed supports the following models: StableLM-3B, - StableLM2-1_6B - StableLM2-Zephyr-1_6B + StableLM-2-1_6B, + StableLM-2-Zephyr-1_6B, + StableLM-2-12B, + StableLM-2-12B-Chat ✅ @@ -301,7 +303,7 @@ Neural Speed supports the following models: Latest - 2048 + 4096 gemma-2b-it , @@ -372,7 +374,7 @@ Neural Speed supports the following models: ✅ Latest - + Magicoder-6.7B ✅ ✅ @@ -398,6 +400,18 @@ Neural Speed supports the following models: Latest + + Stable-Code-3B + ✅ + + + + ✅ + + + + Latest + diff --git a/neural_speed/convert/convert_stablelm.py b/neural_speed/convert/convert_stablelm.py index 727be416e..cdc3caa2c 100644 --- a/neural_speed/convert/convert_stablelm.py +++ b/neural_speed/convert/convert_stablelm.py @@ -29,7 +29,6 @@ Union) from transformers import AutoModelForCausalLM, AutoTokenizer import torch -import gguf # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py def bytes_to_unicode(): @@ -80,134 +79,6 @@ def stack_qk_norm(block, name, n_head, norms, n_dims, ftype, layer_name="q_layer return merged_name, data -def stablelm_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams): - print("stablelm.gguf converting: ") - list_vars = model.state_dict() - n_head = hparams["num_attention_heads"] - n_head_kv = hparams["num_key_value_heads"] - block_count = hparams["num_hidden_layers"] - n_rot = int(hparams["partial_rotary_factor"] * hparams["hidden_size"] / hparams["num_attention_heads"]) - for name in list_vars.keys(): - print(name, list_vars[name].shape, list_vars[name].dtype) - - print(hparams) - - 
gguf_file = fname_out + '.gguf' if not fname_out.endswith(".gguf") else fname_out - gguf_writer = gguf.GGUFWriter(gguf_file, "stablelm") - - gguf_writer.add_uint32('magic', 0x67676d66) - gguf_writer.add_uint32('version', 1) - gguf_writer.add_uint32('n_vocab', hparams["vocab_size"]) - gguf_writer.add_embedding_length(hparams["hidden_size"]) - gguf_writer.add_head_count(n_head) - gguf_writer.add_head_count_kv(n_head_kv) - - gguf_writer.add_block_count(block_count) - gguf_writer.add_rope_dimension_count(n_rot) - gguf_writer.add_uint32('ftype', ftype) - gguf_writer.add_context_length(hparams["max_position_embeddings"]) - gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - - gguf_writer.add_bos_token_id(hparams["bos_token_id"]) - gguf_writer.add_eos_token_id(hparams["eos_token_id"]) - gguf_writer.add_pad_token_id(hparams["pad_token_id"] if hparams["pad_token_id"] else 0) - gguf_writer.add_sep_token_id(hparams["sep_token_id"] if hparams["sep_token_id"] else 0) - - def write_vocab_gguf(dir_model, hparams, gguf_writer): - tokens: list[bytearray] = [] - toktypes: list[int] = [] - - tokenizer = AutoTokenizer.from_pretrained(dir_model) - vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) - assert max(tokenizer.vocab.values()) < vocab_size - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} - added_vocab = tokenizer.get_added_vocab() - - for i in range(vocab_size): - if i not in reverse_vocab: - pad_token = f"[PAD{i}]".encode('utf-8') - tokens.append(bytearray(pad_token)) - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - gguf_writer.add_tokenizer_model("gpt2") - gguf_writer.add_token_list(tokens) - gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) - special_vocab.add_to_gguf(gguf_writer) - - write_vocab_gguf(dir_model, hparams, gguf_writer) - - # tensor info - print("gguf: get tensor metadata") - q_norms, k_norms = dict(), dict() - for name, data_torch in list_vars.items(): - # Convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - # Skip some tensors - if name.endswith((".attention.rotary_emb.inv_freq")): - continue - - data = data_torch.squeeze().numpy() - old_dtype = data.dtype - n_dims = len(data.shape) - if name.find("q_layernorm.norms") != -1: - q_norms[name] = data - if len(q_norms) >= (block_count * n_head): - for block in range(block_count): - name, data = stack_qk_norm(block, name, n_head, q_norms, n_dims, ftype, layer_name="q_layernorm") - print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") - gguf_writer.add_tensor(name, data) - continue - if name.find("k_layernorm.norms") != -1: - k_norms[name] = data - if len(k_norms) >= (block_count * n_head_kv): - for block in range(block_count): - name, data = stack_qk_norm(block, name, n_head_kv, k_norms, n_dims, ftype, layer_name="k_layernorm") - print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") - gguf_writer.add_tensor(name, data) - continue - - # ftype == 0 -> float32, ftype == 1 -> float16 - if ftype != 0: - if name.endswith(".weight") and not 
name.endswith("_norm.weight") and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - else: - print(" Converting to float32") - data = data.astype(np.float32) - else: - if data.dtype != np.float32: - print(" Converting to float32") - data = data.astype(np.float32) - - print(f"Processing variable {name} with shape {data.shape}, {old_dtype} --> {data.dtype}") - gguf_writer.add_tensor(name, data) - - print("gguf: write header") - gguf_writer.write_header_to_file() - print("gguf: write metadata") - gguf_writer.write_kv_data_to_file() - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - - gguf_writer.close() - - print("Done. Output file: " + gguf_file) - print("") - def stablelm_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): print("stablelm ne converting: ") list_vars = model.state_dict() @@ -361,13 +232,6 @@ def main(args_in: Optional[List[str]] = None) -> None: default="huggingface", help="hub to load model" ) - parser.add_argument( - "--format", - type=str, - default="NE", - choices=["NE", "GGUF"], - help="convert to the GGUF or NE format" - ) parser.add_argument( "model", type=Path, @@ -394,11 +258,7 @@ def main(args_in: Optional[List[str]] = None) -> None: print("Loading model: ", dir_model) model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True) hparams = model.config.to_dict() - if args.format == "GGUF": - stablelm_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams) - else: - stablelm_convert(model, tokenizer, dir_model, fname_out, ftype, hparams) - + stablelm_convert(model, tokenizer, dir_model, fname_out, ftype, hparams) if __name__ == '__main__': diff --git a/neural_speed/models/stablelm/stablelm.cpp b/neural_speed/models/stablelm/stablelm.cpp index 19f36c480..04a64d56d 100644 --- a/neural_speed/models/stablelm/stablelm.cpp +++ b/neural_speed/models/stablelm/stablelm.cpp @@ -149,13 +149,13 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* struct ne_tensor* Vcur; if (n_layer == 24) { // StableLM-2-1.6B & StableLM-2-Zephyr-1.6B Qcur = - ne_reshape_4d(ctx0, ne_add(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[0], cur), model.layers[il].attn[1]), + ne_reshape_4d(ctx0, ne_add(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[0], cur), model.layers[il].attn[4]), head_dim, n_head, N, 1); Kcur = - ne_reshape_4d(ctx0, ne_add(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[2], cur), model.layers[il].attn[3]), + ne_reshape_4d(ctx0, ne_add(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[1], cur), model.layers[il].attn[5]), head_dim, n_head_kv, N, 1); Vcur = - ne_reshape_4d(ctx0, ne_add(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[4], cur), model.layers[il].attn[5]), + ne_reshape_4d(ctx0, ne_add(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[2], cur), model.layers[il].attn[6]), head_dim, n_head_kv, N, 1); } else if (n_layer == 32) { // StableLM-3B & Stable-Code-3B Qcur = ne_reshape_4d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[0], cur), head_dim, n_head, N, 1); @@ -303,13 +303,9 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* cur = ne_view_2d(ctx0, KQV_Out, n_embd, N, n_embd * ne_element_size(KQV_Out), 0); } - // projection + // out projection gemm { - if (n_layer == 24) { // StableLM-2-1.6B & StableLM-2-Zephyr-1.6B - cur = ne_mul_mat(ctx0, model.layers[il].attn[6], cur); - } else { // StableLM-3B & Stable-Code-3B & StableLM-2-12B - cur = ne_mul_mat(ctx0, model.layers[il].attn[3], cur); - } + cur = ne_mul_mat(ctx0, 
model.layers[il].attn[3], cur); } } lctx.use_buf(ctx0, 1); diff --git a/neural_speed/models/stablelm/stablelm_utils.cpp b/neural_speed/models/stablelm/stablelm_utils.cpp index 76e4c9e8b..6c258db56 100644 --- a/neural_speed/models/stablelm/stablelm_utils.cpp +++ b/neural_speed/models/stablelm/stablelm_utils.cpp @@ -133,24 +133,16 @@ void stablelm::load(model_context* ctx, model_progress_callback progress_callbac layer.norm[1] = ml->get_tensor(layers_i + ".input_layernorm.bias", {n_embd}, backend); // qkv GEMM + out proj GEMM + layer.attn[0] = ml->get_tensor(layers_i + ".self_attn.q_proj.weight", {n_embd, n_embd_head_k * n_head}, backend); + layer.attn[1] = ml->get_tensor(layers_i + ".self_attn.k_proj.weight", {n_embd, n_embd_head_k * n_head_kv}, backend); + layer.attn[2] = ml->get_tensor(layers_i + ".self_attn.v_proj.weight", {n_embd, n_embd_head_k * n_head_kv}, backend); + layer.attn[3] = ml->get_tensor(layers_i + ".self_attn.o_proj.weight", {n_embd_head_k * n_head, n_embd}, backend); + if (n_layer == 24) { // StableLM-2-1.6B & StableLM-2-Zephyr-1.6B - layer.attn[0] = ml->get_tensor(layers_i + ".self_attn.q_proj.weight", {n_embd, n_embd}, backend); - layer.attn[1] = ml->get_tensor(layers_i + ".self_attn.q_proj.bias", {n_embd}, backend); - layer.attn[2] = ml->get_tensor(layers_i + ".self_attn.k_proj.weight", {n_embd, n_embd}, backend); - layer.attn[3] = ml->get_tensor(layers_i + ".self_attn.k_proj.bias", {n_embd}, backend); - layer.attn[4] = ml->get_tensor(layers_i + ".self_attn.v_proj.weight", {n_embd, n_embd}, backend); - layer.attn[5] = ml->get_tensor(layers_i + ".self_attn.v_proj.bias", {n_embd}, backend); - layer.attn[6] = ml->get_tensor(layers_i + ".self_attn.o_proj.weight", {n_embd, n_embd}, backend); - } else if (n_layer == 32) { // StableLM-3B & Stable-Code-3B - layer.attn[0] = ml->get_tensor(layers_i + ".self_attn.q_proj.weight", {n_embd, n_embd}, backend); - layer.attn[1] = ml->get_tensor(layers_i + ".self_attn.k_proj.weight", {n_embd, n_embd}, backend); - layer.attn[2] = ml->get_tensor(layers_i + ".self_attn.v_proj.weight", {n_embd, n_embd}, backend); - layer.attn[3] = ml->get_tensor(layers_i + ".self_attn.o_proj.weight", {n_embd, n_embd}, backend); + layer.attn[4] = ml->get_tensor(layers_i + ".self_attn.q_proj.bias", {n_embd}, backend); + layer.attn[5] = ml->get_tensor(layers_i + ".self_attn.k_proj.bias", {n_embd}, backend); + layer.attn[6] = ml->get_tensor(layers_i + ".self_attn.v_proj.bias", {n_embd}, backend); } else if (n_layer == 40) { // StableLM-2-12B - layer.attn[0] = ml->get_tensor(layers_i + ".self_attn.q_proj.weight", {n_embd, n_embd_head_k * n_head}, backend); - layer.attn[1] = ml->get_tensor(layers_i + ".self_attn.k_proj.weight", {n_embd, n_embd_head_k * n_head_kv}, backend); - layer.attn[2] = ml->get_tensor(layers_i + ".self_attn.v_proj.weight", {n_embd, n_embd_head_k * n_head_kv}, backend); - layer.attn[3] = ml->get_tensor(layers_i + ".self_attn.o_proj.weight", {n_embd_head_k * n_head, n_embd}, backend); layer.attn[4] = ml->get_tensor(layers_i + ".self_attn.q_layernorm.weight", {n_embd_head_k, n_head}, backend); layer.attn[5] = ml->get_tensor(layers_i + ".self_attn.k_layernorm.weight", {n_embd_head_k, n_head_kv}, backend); } From 80bf6dd035a061372e472e63457f78200aa53e1e Mon Sep 17 00:00:00 2001 From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Date: Fri, 10 May 2024 19:38:37 +0200 Subject: [PATCH 10/14] Fix clang format --- bestla/bestla/kernel_avx2.h | 6 +- bestla/bestla/kernel_avx512f.h | 6 +- bestla/bestla/xbyak/xbyak.h | 4 +- 
neural_speed/core/ne_layers.c | 108 +++++------------- neural_speed/models/stablelm/stablelm.cpp | 30 +++-- neural_speed/models/stablelm/stablelm.h | 6 +- .../models/stablelm/stablelm_utils.cpp | 9 +- 7 files changed, 59 insertions(+), 110 deletions(-) diff --git a/bestla/bestla/kernel_avx2.h b/bestla/bestla/kernel_avx2.h index 8856010b6..f3ec4b50f 100644 --- a/bestla/bestla/kernel_avx2.h +++ b/bestla/bestla/kernel_avx2.h @@ -690,7 +690,7 @@ template inline BTLA_CODE decompress_kblock_s4_s8(utils::int4x2* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::int4x2 * srcptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, + typedef BTLA_CODE (*decompfunc)(utils::int4x2* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; if (col == NTILE) { @@ -763,7 +763,7 @@ static inline BTLA_CODE decompress_kblock_s2_s8(utils::bit2x4* bit2ptr, int8_t* int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit2x4 * srcptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, + typedef BTLA_CODE (*decompfunc)(utils::bit2x4* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; if (col == NTILE) { @@ -1021,7 +1021,7 @@ static inline BTLA_CODE decompress_kblock_s3_s8(utils::bit2x4* bit2ptr, utils::b int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit2x4 * bit2ptr, utils::bit1x8 * bit1ptr, int8_t * zpptr, int8_t * dstptr, + typedef BTLA_CODE (*decompfunc)(utils::bit2x4* bit2ptr, utils::bit1x8* bit1ptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; diff --git a/bestla/bestla/kernel_avx512f.h b/bestla/bestla/kernel_avx512f.h index 41c82bbcf..2695729a0 100644 --- a/bestla/bestla/kernel_avx512f.h +++ b/bestla/bestla/kernel_avx512f.h @@ -2594,7 +2594,7 @@ template inline BTLA_CODE decompress_kblock_s4_s8(utils::int4x2* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::int4x2 * srcptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, + typedef BTLA_CODE (*decompfunc)(utils::int4x2* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; if (col == NTILE) { @@ -2816,7 +2816,7 @@ static inline BTLA_CODE decompress_kblock_s2_s8(utils::bit2x4* bit2ptr, int8_t* int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit2x4 * srcptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, + typedef BTLA_CODE (*decompfunc)(utils::bit2x4* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; if (col == NTILE) { @@ -3074,7 +3074,7 @@ static inline BTLA_CODE decompress_kblock_s3_s8(utils::bit2x4* bit2ptr, utils::b 
int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit2x4 * bit2ptr, utils::bit1x8 * bit1ptr, int8_t * zpptr, int8_t * dstptr, + typedef BTLA_CODE (*decompfunc)(utils::bit2x4* bit2ptr, utils::bit1x8* bit1ptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; diff --git a/bestla/bestla/xbyak/xbyak.h b/bestla/bestla/xbyak/xbyak.h index fab31919d..a03a0897b 100644 --- a/bestla/bestla/xbyak/xbyak.h +++ b/bestla/bestla/xbyak/xbyak.h @@ -45,7 +45,7 @@ #endif #ifdef __GNUC__ -#define XBYAK_GNUC_PREREQ(major, minor) ((__GNUC__)*100 + (__GNUC_MINOR__) >= (major)*100 + (minor)) +#define XBYAK_GNUC_PREREQ(major, minor) ((__GNUC__) * 100 + (__GNUC_MINOR__) >= (major) * 100 + (minor)) #else #define XBYAK_GNUC_PREREQ(major, minor) 0 #endif @@ -191,7 +191,7 @@ typedef uint8_t uint8; #endif #endif #ifndef MIE_PACK // for shufps -#define MIE_PACK(x, y, z, w) ((x)*64 + (y)*16 + (z)*4 + (w)) +#define MIE_PACK(x, y, z, w) ((x) * 64 + (y) * 16 + (z) * 4 + (w)) #endif enum { diff --git a/neural_speed/core/ne_layers.c b/neural_speed/core/ne_layers.c index 5e22da792..a6c326378 100644 --- a/neural_speed/core/ne_layers.c +++ b/neural_speed/core/ne_layers.c @@ -1351,13 +1351,9 @@ struct ne_tensor* ne_debug_op(struct ne_context* ctx, struct ne_tensor* a, ne_de return result; } -struct ne_tensor* ne_dup(struct ne_context* ctx, struct ne_tensor* a) { - return ne_dup_impl(ctx, a, false); -} +struct ne_tensor* ne_dup(struct ne_context* ctx, struct ne_tensor* a) { return ne_dup_impl(ctx, a, false); } -struct ne_tensor* ne_dup_inplace(struct ne_context* ctx, struct ne_tensor* a) { - return ne_dup_impl(ctx, a, true); -} +struct ne_tensor* ne_dup_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_dup_impl(ctx, a, true); } // ne_add @@ -1722,13 +1718,9 @@ struct ne_tensor* ne_sqr_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_sqr(struct ne_context* ctx, struct ne_tensor* a) { - return ne_sqr_impl(ctx, a, false); -} +struct ne_tensor* ne_sqr(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqr_impl(ctx, a, false); } -struct ne_tensor* ne_sqr_inplace(struct ne_context* ctx, struct ne_tensor* a) { - return ne_sqr_impl(ctx, a, true); -} +struct ne_tensor* ne_sqr_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqr_impl(ctx, a, true); } // ne_sqrt @@ -1749,13 +1741,9 @@ struct ne_tensor* ne_sqrt_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_sqrt(struct ne_context* ctx, struct ne_tensor* a) { - return ne_sqrt_impl(ctx, a, false); -} +struct ne_tensor* ne_sqrt(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqrt_impl(ctx, a, false); } -struct ne_tensor* ne_sqrt_inplace(struct ne_context* ctx, struct ne_tensor* a) { - return ne_sqrt_impl(ctx, a, true); -} +struct ne_tensor* ne_sqrt_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqrt_impl(ctx, a, true); } // ne_log @@ -1776,13 +1764,9 @@ struct ne_tensor* ne_log_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_log(struct ne_context* ctx, struct ne_tensor* a) { - return ne_log_impl(ctx, a, false); -} +struct ne_tensor* ne_log(struct ne_context* ctx, struct ne_tensor* a) { return ne_log_impl(ctx, a, false); } -struct ne_tensor* ne_log_inplace(struct ne_context* ctx, 
struct ne_tensor* a) { - return ne_log_impl(ctx, a, true); -} +struct ne_tensor* ne_log_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_log_impl(ctx, a, true); } // ne_sum @@ -1892,13 +1876,9 @@ struct ne_tensor* ne_abs_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_abs(struct ne_context* ctx, struct ne_tensor* a) { - return ne_abs_impl(ctx, a, false); -} +struct ne_tensor* ne_abs(struct ne_context* ctx, struct ne_tensor* a) { return ne_abs_impl(ctx, a, false); } -struct ne_tensor* ne_abs_inplace(struct ne_context* ctx, struct ne_tensor* a) { - return ne_abs_impl(ctx, a, true); -} +struct ne_tensor* ne_abs_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_abs_impl(ctx, a, true); } // ne_sgn @@ -1919,13 +1899,9 @@ struct ne_tensor* ne_sgn_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_sgn(struct ne_context* ctx, struct ne_tensor* a) { - return ne_sgn_impl(ctx, a, false); -} +struct ne_tensor* ne_sgn(struct ne_context* ctx, struct ne_tensor* a) { return ne_sgn_impl(ctx, a, false); } -struct ne_tensor* ne_sgn_inplace(struct ne_context* ctx, struct ne_tensor* a) { - return ne_sgn_impl(ctx, a, true); -} +struct ne_tensor* ne_sgn_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_sgn_impl(ctx, a, true); } // ne_neg @@ -1946,13 +1922,9 @@ struct ne_tensor* ne_neg_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_neg(struct ne_context* ctx, struct ne_tensor* a) { - return ne_neg_impl(ctx, a, false); -} +struct ne_tensor* ne_neg(struct ne_context* ctx, struct ne_tensor* a) { return ne_neg_impl(ctx, a, false); } -struct ne_tensor* ne_neg_inplace(struct ne_context* ctx, struct ne_tensor* a) { - return ne_neg_impl(ctx, a, true); -} +struct ne_tensor* ne_neg_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_neg_impl(ctx, a, true); } // ne_step @@ -1973,13 +1945,9 @@ struct ne_tensor* ne_step_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_step(struct ne_context* ctx, struct ne_tensor* a) { - return ne_step_impl(ctx, a, false); -} +struct ne_tensor* ne_step(struct ne_context* ctx, struct ne_tensor* a) { return ne_step_impl(ctx, a, false); } -struct ne_tensor* ne_step_inplace(struct ne_context* ctx, struct ne_tensor* a) { - return ne_step_impl(ctx, a, true); -} +struct ne_tensor* ne_step_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_step_impl(ctx, a, true); } // ne_relu @@ -2000,13 +1968,9 @@ struct ne_tensor* ne_relu_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_relu(struct ne_context* ctx, struct ne_tensor* a) { - return ne_relu_impl(ctx, a, false); -} +struct ne_tensor* ne_relu(struct ne_context* ctx, struct ne_tensor* a) { return ne_relu_impl(ctx, a, false); } -struct ne_tensor* ne_relu_inplace(struct ne_context* ctx, struct ne_tensor* a) { - return ne_relu_impl(ctx, a, true); -} +struct ne_tensor* ne_relu_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_relu_impl(ctx, a, true); } // ne_gelu @@ -2027,13 +1991,9 @@ struct ne_tensor* ne_gelu_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_gelu(struct ne_context* ctx, struct ne_tensor* a) { - return ne_gelu_impl(ctx, a, false); -} +struct ne_tensor* ne_gelu(struct ne_context* ctx, struct ne_tensor* a) { return ne_gelu_impl(ctx, a, false); } -struct ne_tensor* ne_gelu_inplace(struct 
ne_context* ctx, struct ne_tensor* a) { - return ne_gelu_impl(ctx, a, true); -} +struct ne_tensor* ne_gelu_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_gelu_impl(ctx, a, true); } // ne_silu @@ -2054,13 +2014,9 @@ struct ne_tensor* ne_silu_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_silu(struct ne_context* ctx, struct ne_tensor* a) { - return ne_silu_impl(ctx, a, false); -} +struct ne_tensor* ne_silu(struct ne_context* ctx, struct ne_tensor* a) { return ne_silu_impl(ctx, a, false); } -struct ne_tensor* ne_silu_inplace(struct ne_context* ctx, struct ne_tensor* a) { - return ne_silu_impl(ctx, a, true); -} +struct ne_tensor* ne_silu_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_silu_impl(ctx, a, true); } // ne_silu_back @@ -2630,13 +2586,9 @@ struct ne_tensor* ne_cont_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_cont(struct ne_context* ctx, struct ne_tensor* a) { - return ne_cont_impl(ctx, a, false); -} +struct ne_tensor* ne_cont(struct ne_context* ctx, struct ne_tensor* a) { return ne_cont_impl(ctx, a, false); } -struct ne_tensor* ne_cont_inplace(struct ne_context* ctx, struct ne_tensor* a) { - return ne_cont_impl(ctx, a, true); -} +struct ne_tensor* ne_cont_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_cont_impl(ctx, a, true); } // ne_reshape @@ -3188,9 +3140,7 @@ struct ne_tensor* ne_soft_max_impl(struct ne_context* ctx, struct ne_tensor* a, return result; } -struct ne_tensor* ne_soft_max(struct ne_context* ctx, struct ne_tensor* a) { - return ne_soft_max_impl(ctx, a, false); -} +struct ne_tensor* ne_soft_max(struct ne_context* ctx, struct ne_tensor* a) { return ne_soft_max_impl(ctx, a, false); } struct ne_tensor* ne_soft_max_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_soft_max_impl(ctx, a, true); @@ -7133,7 +7083,7 @@ static void ne_compute_forward_mul_mat_id_q_f32(const struct ne_compute_params* // char * wdata_src1_end = (char *)params->wdata; // int64_t wdata_src1_end = 0; -#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] +#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id) * ne11 + (i1)] // nb01 >= nb00 - src0 is not transposed // compute by src0 rows @@ -7295,7 +7245,7 @@ static void ne_compute_forward_mul_mat_id_f32(const struct ne_compute_params* pa } int64_t matrix_row_counts[100]; // [n_as] int64_t matrix_rows[30000]; // [n_as][ne11] -#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] +#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id) * ne11 + (i1)] memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); memset(matrix_rows, -1, 30000 * sizeof(int64_t)); for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { @@ -7443,7 +7393,7 @@ static void ne_compute_forward_mul_mat_id_f16_f32(const struct ne_compute_params } int64_t matrix_row_counts[100]; // [n_as] int64_t matrix_rows[30000]; // [n_as][ne11] -#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] +#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id) * ne11 + (i1)] memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); memset(matrix_rows, -1, 30000 * sizeof(int64_t)); for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { @@ -7572,7 +7522,7 @@ static void ne_compute_forward_mul_mat_id_q_f32_bestla(const struct ne_compute_p // int64_t wdata_src1_end = 0; int64_t matrix_row_counts[100]; // [n_as] int64_t matrix_rows[30000]; // [n_as][ne11] -#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 
+ (i1)] +#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id) * ne11 + (i1)] // nb01 >= nb00 - src0 is not transposed // compute by src0 rows diff --git a/neural_speed/models/stablelm/stablelm.cpp b/neural_speed/models/stablelm/stablelm.cpp index 04a64d56d..20c843b37 100644 --- a/neural_speed/models/stablelm/stablelm.cpp +++ b/neural_speed/models/stablelm/stablelm.cpp @@ -228,10 +228,10 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* struct ne_tensor* Q = ne_permute(ctx0, ne_reshape_4d(ctx0, Qcur, head_dim, n_head, N, batch_size), 0, 2, 1, 3); ne_set_name(Q, "Q"); // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - struct ne_tensor* K = - ne_view_4d(ctx0, kv_self.k, head_dim, n_past + N, n_head_kv, batch_size, ne_element_size(kv_self.k) * head_dim, - ne_element_size(kv_self.k) * head_dim * n_ctx, ne_element_size(kv_self.k) * n_embd * n_ctx, - il * n_ctx * ne_element_size(kv_self.k) * n_embd * kv_n_ctx_block); + struct ne_tensor* K = ne_view_4d( + ctx0, kv_self.k, head_dim, n_past + N, n_head_kv, batch_size, ne_element_size(kv_self.k) * head_dim, + ne_element_size(kv_self.k) * head_dim * n_ctx, ne_element_size(kv_self.k) * n_embd * n_ctx, + il * n_ctx * ne_element_size(kv_self.k) * n_embd * kv_n_ctx_block); ne_set_name(K, "K"); // K * Q struct ne_tensor* KQ = ne_mul_mat(ctx0, K, Q); @@ -267,15 +267,15 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* // store key and value to memory { - const auto k_cache = ne_view_3d(ctx0, kv_self.k, // tensor - head_dim, n_ctx, n_head_kv, // ne - 0, 0, // nb (bestla managed) - il * k_size); // offset + const auto k_cache = ne_view_3d(ctx0, kv_self.k, // tensor + head_dim, n_ctx, n_head_kv, // ne + 0, 0, // nb (bestla managed) + il * k_size); // offset ne_build_forward_expand(&gf, ne_flash_attn_update_k(ctx0, k_cache, Kcur, n_past, false)); - const auto v_cache = ne_view_3d(ctx0, kv_self.v, // tensor - head_dim, n_ctx, n_head_kv, // ne - 0, 0, // nb (bestla managed) - il * v_size); // offset + const auto v_cache = ne_view_3d(ctx0, kv_self.v, // tensor + head_dim, n_ctx, n_head_kv, // ne + 0, 0, // nb (bestla managed) + il * v_size); // offset ne_build_forward_expand(&gf, ne_flash_attn_update_v(ctx0, v_cache, Vcur, n_past, false)); } @@ -304,12 +304,10 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input* } // out projection gemm - { - cur = ne_mul_mat(ctx0, model.layers[il].attn[3], cur); - } + { cur = ne_mul_mat(ctx0, model.layers[il].attn[3], cur); } } lctx.use_buf(ctx0, 1); - + cur = ne_add(ctx0, cur, inpL); inpL = cur; diff --git a/neural_speed/models/stablelm/stablelm.h b/neural_speed/models/stablelm/stablelm.h index 9cfe4f102..8309a6ff4 100644 --- a/neural_speed/models/stablelm/stablelm.h +++ b/neural_speed/models/stablelm/stablelm.h @@ -27,19 +27,19 @@ enum stablelm_model { static const model_scratch stablelm_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { switch (n_layers) { - case 24: // StableLM-2-1.6B & StableLM-2-Zephyr-1.6B + case 24: // StableLM-2-1.6B & StableLM-2-Zephyr-1.6B return { static_cast(scratch_size_ratio * 512) * MB, static_cast(scratch_size_ratio * 512) * MB, static_cast(scratch_size_ratio * 1024) * MB, }; - case 32: // StableLM-3B & Stable-Code-3B + case 32: // StableLM-3B & Stable-Code-3B return { static_cast(scratch_size_ratio * 1024) * MB, static_cast(scratch_size_ratio * 1024) * MB, static_cast(scratch_size_ratio * 1024) * MB, }; - case 40: // StableLM-2-12B + case 40: // StableLM-2-12B return { 
static_cast(scratch_size_ratio * 2560) * MB, static_cast(scratch_size_ratio * 2560) * MB, diff --git a/neural_speed/models/stablelm/stablelm_utils.cpp b/neural_speed/models/stablelm/stablelm_utils.cpp index 6c258db56..7fbf709c1 100644 --- a/neural_speed/models/stablelm/stablelm_utils.cpp +++ b/neural_speed/models/stablelm/stablelm_utils.cpp @@ -145,7 +145,7 @@ void stablelm::load(model_context* ctx, model_progress_callback progress_callbac } else if (n_layer == 40) { // StableLM-2-12B layer.attn[4] = ml->get_tensor(layers_i + ".self_attn.q_layernorm.weight", {n_embd_head_k, n_head}, backend); layer.attn[5] = ml->get_tensor(layers_i + ".self_attn.k_layernorm.weight", {n_embd_head_k, n_head_kv}, backend); - } + } // Post Attention norm - Only present in 1.6B & 3B if (n_layer < 40) { @@ -165,12 +165,12 @@ void stablelm::load(model_context* ctx, model_progress_callback progress_callbac ne_nbytes(layer.attn[2]) + ne_nbytes(layer.attn[3]) + ne_nbytes(layer.attn[4]) + ne_nbytes(layer.attn[5]) + ne_nbytes(layer.attn[6]) + ne_nbytes(layer.ffn[0]) + ne_nbytes(layer.ffn[1]) + ne_nbytes(layer.ffn[2]); - } else if (n_layer == 32) { + } else if (n_layer == 32) { vram_total += ne_nbytes(layer.norm[0]) + ne_nbytes(layer.norm[1]) + ne_nbytes(layer.norm[2]) + ne_nbytes(layer.norm[3]) + ne_nbytes(layer.attn[0]) + ne_nbytes(layer.attn[1]) + ne_nbytes(layer.attn[2]) + ne_nbytes(layer.attn[3]) + ne_nbytes(layer.ffn[0]) + ne_nbytes(layer.ffn[1]) + ne_nbytes(layer.ffn[2]); - } else if (n_layer == 40) { + } else if (n_layer == 40) { vram_total += ne_nbytes(layer.norm[0]) + ne_nbytes(layer.norm[1]) + ne_nbytes(layer.attn[0]) + ne_nbytes(layer.attn[1]) + ne_nbytes(layer.attn[2]) + ne_nbytes(layer.attn[3]) + ne_nbytes(layer.attn[4]) + ne_nbytes(layer.attn[5]) + ne_nbytes(layer.ffn[0]) + @@ -205,7 +205,8 @@ void stablelm::load(model_context* ctx, model_progress_callback progress_callbac class stablelm_quant_layer : public quant_layer_base { public: quant_params_internal get_layer_config(std::string layername, std::vector ne, ne_type type) override { - bool quantize = (layername.rfind("weight") == layername.size() - 6) && (layername.find("layernorm") == std::string::npos); // ends with 'weight'? + bool quantize = (layername.rfind("weight") == layername.size() - 6) && + (layername.find("layernorm") == std::string::npos); // ends with 'weight'? 
if (layername == "model.embed_tokens.weight") { // special layer process, can be loaded by config file return quant_params_internal(); // return q4_0 to cover the usage of getrow From 5a08cfc20cff252801e558aebb92cbf83755df7d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 May 2024 18:44:15 +0000 Subject: [PATCH 11/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_speed/convert/convert_stablelm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_speed/convert/convert_stablelm.py b/neural_speed/convert/convert_stablelm.py index cdc3caa2c..8340aecee 100644 --- a/neural_speed/convert/convert_stablelm.py +++ b/neural_speed/convert/convert_stablelm.py @@ -145,7 +145,7 @@ def stablelm_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): fout.write(struct.pack("i", len(text))) fout.write(text) fout.write(struct.pack("f", -10000)) - + def write_header(name, data, ftype=0): str = name.encode('utf-8') n_dims = len(data.shape) @@ -165,7 +165,7 @@ def write_header(name, data, ftype=0): # Skip some tensors if name.endswith((".attention.rotary_emb.inv_freq")): continue - + data = data_torch.squeeze().numpy() old_dtype = data.dtype n_dims = len(data.shape) From 223e3756986a540ecb0c2a07c773253ae0cd9de7 Mon Sep 17 00:00:00 2001 From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Date: Tue, 14 May 2024 12:29:00 +0200 Subject: [PATCH 12/14] Fix clang formatting issue --- bestla/bestla/kernel_avx2.h | 6 +- bestla/bestla/kernel_avx512f.h | 6 +- bestla/bestla/xbyak/xbyak.h | 4 +- neural_speed/core/ne_layers.c | 108 ++++++++++++++++++++++++--------- 4 files changed, 87 insertions(+), 37 deletions(-) diff --git a/bestla/bestla/kernel_avx2.h b/bestla/bestla/kernel_avx2.h index f3ec4b50f..8856010b6 100644 --- a/bestla/bestla/kernel_avx2.h +++ b/bestla/bestla/kernel_avx2.h @@ -690,7 +690,7 @@ template inline BTLA_CODE decompress_kblock_s4_s8(utils::int4x2* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::int4x2* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, + typedef BTLA_CODE (*decompfunc)(utils::int4x2 * srcptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; if (col == NTILE) { @@ -763,7 +763,7 @@ static inline BTLA_CODE decompress_kblock_s2_s8(utils::bit2x4* bit2ptr, int8_t* int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit2x4* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, + typedef BTLA_CODE (*decompfunc)(utils::bit2x4 * srcptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; if (col == NTILE) { @@ -1021,7 +1021,7 @@ static inline BTLA_CODE decompress_kblock_s3_s8(utils::bit2x4* bit2ptr, utils::b int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit2x4* bit2ptr, utils::bit1x8* bit1ptr, int8_t* zpptr, int8_t* dstptr, + typedef BTLA_CODE (*decompfunc)(utils::bit2x4 * bit2ptr, utils::bit1x8 * bit1ptr, int8_t * zpptr, int8_t * dstptr, 
int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; diff --git a/bestla/bestla/kernel_avx512f.h b/bestla/bestla/kernel_avx512f.h index 2695729a0..41c82bbcf 100644 --- a/bestla/bestla/kernel_avx512f.h +++ b/bestla/bestla/kernel_avx512f.h @@ -2594,7 +2594,7 @@ template inline BTLA_CODE decompress_kblock_s4_s8(utils::int4x2* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::int4x2* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, + typedef BTLA_CODE (*decompfunc)(utils::int4x2 * srcptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; if (col == NTILE) { @@ -2816,7 +2816,7 @@ static inline BTLA_CODE decompress_kblock_s2_s8(utils::bit2x4* bit2ptr, int8_t* int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit2x4* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, + typedef BTLA_CODE (*decompfunc)(utils::bit2x4 * srcptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; if (col == NTILE) { @@ -3074,7 +3074,7 @@ static inline BTLA_CODE decompress_kblock_s3_s8(utils::bit2x4* bit2ptr, utils::b int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit2x4* bit2ptr, utils::bit1x8* bit1ptr, int8_t* zpptr, int8_t* dstptr, + typedef BTLA_CODE (*decompfunc)(utils::bit2x4 * bit2ptr, utils::bit1x8 * bit1ptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; diff --git a/bestla/bestla/xbyak/xbyak.h b/bestla/bestla/xbyak/xbyak.h index a03a0897b..fab31919d 100644 --- a/bestla/bestla/xbyak/xbyak.h +++ b/bestla/bestla/xbyak/xbyak.h @@ -45,7 +45,7 @@ #endif #ifdef __GNUC__ -#define XBYAK_GNUC_PREREQ(major, minor) ((__GNUC__) * 100 + (__GNUC_MINOR__) >= (major) * 100 + (minor)) +#define XBYAK_GNUC_PREREQ(major, minor) ((__GNUC__)*100 + (__GNUC_MINOR__) >= (major)*100 + (minor)) #else #define XBYAK_GNUC_PREREQ(major, minor) 0 #endif @@ -191,7 +191,7 @@ typedef uint8_t uint8; #endif #endif #ifndef MIE_PACK // for shufps -#define MIE_PACK(x, y, z, w) ((x) * 64 + (y) * 16 + (z) * 4 + (w)) +#define MIE_PACK(x, y, z, w) ((x)*64 + (y)*16 + (z)*4 + (w)) #endif enum { diff --git a/neural_speed/core/ne_layers.c b/neural_speed/core/ne_layers.c index a6c326378..5e22da792 100644 --- a/neural_speed/core/ne_layers.c +++ b/neural_speed/core/ne_layers.c @@ -1351,9 +1351,13 @@ struct ne_tensor* ne_debug_op(struct ne_context* ctx, struct ne_tensor* a, ne_de return result; } -struct ne_tensor* ne_dup(struct ne_context* ctx, struct ne_tensor* a) { return ne_dup_impl(ctx, a, false); } +struct ne_tensor* ne_dup(struct ne_context* ctx, struct ne_tensor* a) { + return ne_dup_impl(ctx, a, false); +} -struct ne_tensor* ne_dup_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_dup_impl(ctx, a, true); } +struct ne_tensor* ne_dup_inplace(struct ne_context* ctx, struct ne_tensor* a) { + return ne_dup_impl(ctx, a, true); +} // ne_add @@ -1718,9 +1722,13 @@ struct ne_tensor* 
ne_sqr_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_sqr(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqr_impl(ctx, a, false); } +struct ne_tensor* ne_sqr(struct ne_context* ctx, struct ne_tensor* a) { + return ne_sqr_impl(ctx, a, false); +} -struct ne_tensor* ne_sqr_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqr_impl(ctx, a, true); } +struct ne_tensor* ne_sqr_inplace(struct ne_context* ctx, struct ne_tensor* a) { + return ne_sqr_impl(ctx, a, true); +} // ne_sqrt @@ -1741,9 +1749,13 @@ struct ne_tensor* ne_sqrt_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_sqrt(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqrt_impl(ctx, a, false); } +struct ne_tensor* ne_sqrt(struct ne_context* ctx, struct ne_tensor* a) { + return ne_sqrt_impl(ctx, a, false); +} -struct ne_tensor* ne_sqrt_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqrt_impl(ctx, a, true); } +struct ne_tensor* ne_sqrt_inplace(struct ne_context* ctx, struct ne_tensor* a) { + return ne_sqrt_impl(ctx, a, true); +} // ne_log @@ -1764,9 +1776,13 @@ struct ne_tensor* ne_log_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_log(struct ne_context* ctx, struct ne_tensor* a) { return ne_log_impl(ctx, a, false); } +struct ne_tensor* ne_log(struct ne_context* ctx, struct ne_tensor* a) { + return ne_log_impl(ctx, a, false); +} -struct ne_tensor* ne_log_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_log_impl(ctx, a, true); } +struct ne_tensor* ne_log_inplace(struct ne_context* ctx, struct ne_tensor* a) { + return ne_log_impl(ctx, a, true); +} // ne_sum @@ -1876,9 +1892,13 @@ struct ne_tensor* ne_abs_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_abs(struct ne_context* ctx, struct ne_tensor* a) { return ne_abs_impl(ctx, a, false); } +struct ne_tensor* ne_abs(struct ne_context* ctx, struct ne_tensor* a) { + return ne_abs_impl(ctx, a, false); +} -struct ne_tensor* ne_abs_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_abs_impl(ctx, a, true); } +struct ne_tensor* ne_abs_inplace(struct ne_context* ctx, struct ne_tensor* a) { + return ne_abs_impl(ctx, a, true); +} // ne_sgn @@ -1899,9 +1919,13 @@ struct ne_tensor* ne_sgn_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_sgn(struct ne_context* ctx, struct ne_tensor* a) { return ne_sgn_impl(ctx, a, false); } +struct ne_tensor* ne_sgn(struct ne_context* ctx, struct ne_tensor* a) { + return ne_sgn_impl(ctx, a, false); +} -struct ne_tensor* ne_sgn_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_sgn_impl(ctx, a, true); } +struct ne_tensor* ne_sgn_inplace(struct ne_context* ctx, struct ne_tensor* a) { + return ne_sgn_impl(ctx, a, true); +} // ne_neg @@ -1922,9 +1946,13 @@ struct ne_tensor* ne_neg_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_neg(struct ne_context* ctx, struct ne_tensor* a) { return ne_neg_impl(ctx, a, false); } +struct ne_tensor* ne_neg(struct ne_context* ctx, struct ne_tensor* a) { + return ne_neg_impl(ctx, a, false); +} -struct ne_tensor* ne_neg_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_neg_impl(ctx, a, true); } +struct ne_tensor* ne_neg_inplace(struct ne_context* ctx, struct ne_tensor* a) { + return ne_neg_impl(ctx, a, true); +} // ne_step @@ -1945,9 +1973,13 @@ struct ne_tensor* 
ne_step_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_step(struct ne_context* ctx, struct ne_tensor* a) { return ne_step_impl(ctx, a, false); } +struct ne_tensor* ne_step(struct ne_context* ctx, struct ne_tensor* a) { + return ne_step_impl(ctx, a, false); +} -struct ne_tensor* ne_step_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_step_impl(ctx, a, true); } +struct ne_tensor* ne_step_inplace(struct ne_context* ctx, struct ne_tensor* a) { + return ne_step_impl(ctx, a, true); +} // ne_relu @@ -1968,9 +2000,13 @@ struct ne_tensor* ne_relu_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_relu(struct ne_context* ctx, struct ne_tensor* a) { return ne_relu_impl(ctx, a, false); } +struct ne_tensor* ne_relu(struct ne_context* ctx, struct ne_tensor* a) { + return ne_relu_impl(ctx, a, false); +} -struct ne_tensor* ne_relu_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_relu_impl(ctx, a, true); } +struct ne_tensor* ne_relu_inplace(struct ne_context* ctx, struct ne_tensor* a) { + return ne_relu_impl(ctx, a, true); +} // ne_gelu @@ -1991,9 +2027,13 @@ struct ne_tensor* ne_gelu_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_gelu(struct ne_context* ctx, struct ne_tensor* a) { return ne_gelu_impl(ctx, a, false); } +struct ne_tensor* ne_gelu(struct ne_context* ctx, struct ne_tensor* a) { + return ne_gelu_impl(ctx, a, false); +} -struct ne_tensor* ne_gelu_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_gelu_impl(ctx, a, true); } +struct ne_tensor* ne_gelu_inplace(struct ne_context* ctx, struct ne_tensor* a) { + return ne_gelu_impl(ctx, a, true); +} // ne_silu @@ -2014,9 +2054,13 @@ struct ne_tensor* ne_silu_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_silu(struct ne_context* ctx, struct ne_tensor* a) { return ne_silu_impl(ctx, a, false); } +struct ne_tensor* ne_silu(struct ne_context* ctx, struct ne_tensor* a) { + return ne_silu_impl(ctx, a, false); +} -struct ne_tensor* ne_silu_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_silu_impl(ctx, a, true); } +struct ne_tensor* ne_silu_inplace(struct ne_context* ctx, struct ne_tensor* a) { + return ne_silu_impl(ctx, a, true); +} // ne_silu_back @@ -2586,9 +2630,13 @@ struct ne_tensor* ne_cont_impl(struct ne_context* ctx, struct ne_tensor* a, bool return result; } -struct ne_tensor* ne_cont(struct ne_context* ctx, struct ne_tensor* a) { return ne_cont_impl(ctx, a, false); } +struct ne_tensor* ne_cont(struct ne_context* ctx, struct ne_tensor* a) { + return ne_cont_impl(ctx, a, false); +} -struct ne_tensor* ne_cont_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_cont_impl(ctx, a, true); } +struct ne_tensor* ne_cont_inplace(struct ne_context* ctx, struct ne_tensor* a) { + return ne_cont_impl(ctx, a, true); +} // ne_reshape @@ -3140,7 +3188,9 @@ struct ne_tensor* ne_soft_max_impl(struct ne_context* ctx, struct ne_tensor* a, return result; } -struct ne_tensor* ne_soft_max(struct ne_context* ctx, struct ne_tensor* a) { return ne_soft_max_impl(ctx, a, false); } +struct ne_tensor* ne_soft_max(struct ne_context* ctx, struct ne_tensor* a) { + return ne_soft_max_impl(ctx, a, false); +} struct ne_tensor* ne_soft_max_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_soft_max_impl(ctx, a, true); @@ -7083,7 +7133,7 @@ static void ne_compute_forward_mul_mat_id_q_f32(const struct ne_compute_params* 
// char * wdata_src1_end = (char *)params->wdata; // int64_t wdata_src1_end = 0; -#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id) * ne11 + (i1)] +#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] // nb01 >= nb00 - src0 is not transposed // compute by src0 rows @@ -7245,7 +7295,7 @@ static void ne_compute_forward_mul_mat_id_f32(const struct ne_compute_params* pa } int64_t matrix_row_counts[100]; // [n_as] int64_t matrix_rows[30000]; // [n_as][ne11] -#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id) * ne11 + (i1)] +#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); memset(matrix_rows, -1, 30000 * sizeof(int64_t)); for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { @@ -7393,7 +7443,7 @@ static void ne_compute_forward_mul_mat_id_f16_f32(const struct ne_compute_params } int64_t matrix_row_counts[100]; // [n_as] int64_t matrix_rows[30000]; // [n_as][ne11] -#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id) * ne11 + (i1)] +#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); memset(matrix_rows, -1, 30000 * sizeof(int64_t)); for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { @@ -7522,7 +7572,7 @@ static void ne_compute_forward_mul_mat_id_q_f32_bestla(const struct ne_compute_p // int64_t wdata_src1_end = 0; int64_t matrix_row_counts[100]; // [n_as] int64_t matrix_rows[30000]; // [n_as][ne11] -#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id) * ne11 + (i1)] +#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] // nb01 >= nb00 - src0 is not transposed // compute by src0 rows From e251da21418dd867e81e3abe765c28a45a525374 Mon Sep 17 00:00:00 2001 From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Date: Tue, 14 May 2024 12:35:41 +0200 Subject: [PATCH 13/14] Modify list of supported models --- docs/supported_models.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/supported_models.md b/docs/supported_models.md index 2222af282..115bc9955 100644 --- a/docs/supported_models.md +++ b/docs/supported_models.md @@ -289,11 +289,9 @@ Neural Speed supports the following models: 128k - StableLM-3B, - StableLM-2-1_6B, - StableLM-2-Zephyr-1_6B, - StableLM-2-12B, - StableLM-2-12B-Chat + StableLM-2-1_6B, + StableLM-3B, + StableLM-2-12B ✅ From 3eb74ee068cc283e4626231fb987cd1088b41513 Mon Sep 17 00:00:00 2001 From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Date: Tue, 14 May 2024 14:55:34 +0200 Subject: [PATCH 14/14] Minor fix --- neural_speed/models/stablelm/stablelm_utils.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/neural_speed/models/stablelm/stablelm_utils.cpp b/neural_speed/models/stablelm/stablelm_utils.cpp index 7fbf709c1..a7714cdb9 100644 --- a/neural_speed/models/stablelm/stablelm_utils.cpp +++ b/neural_speed/models/stablelm/stablelm_utils.cpp @@ -205,8 +205,9 @@ void stablelm::load(model_context* ctx, model_progress_callback progress_callbac class stablelm_quant_layer : public quant_layer_base { public: quant_params_internal get_layer_config(std::string layername, std::vector ne, ne_type type) override { - bool quantize = (layername.rfind("weight") == layername.size() - 6) && - (layername.find("layernorm") == std::string::npos); // ends with 'weight'? 
+ bool quantize = + (layername.rfind("weight") == layername.size() - 6) && + (layername.find("layernorm") == std::string::npos); // quantize if ending with 'weight' && not a layernorm if (layername == "model.embed_tokens.weight") { // special layer process, can be loaded by config file return quant_params_internal(); // return q4_0 to cover the usage of getrow
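
For reference, the condition computed in get_layer_config above quantizes a tensor only when its name ends with "weight" and the name does not contain "layernorm". Below is a minimal standalone sketch of that predicate under the assumption of a hypothetical helper named should_quantize; it is illustrative only and not part of the patch.

#include <string>

// Returns true when a tensor should be quantized: the name must end with
// "weight" and must not belong to a layernorm. The explicit size() >= 6 guard
// avoids a corner-case false match on very short names, where rfind() == npos
// could otherwise compare equal to layername.size() - 6.
static bool should_quantize(const std::string& layername) {
  const bool ends_with_weight =
      layername.size() >= 6 && layername.rfind("weight") == layername.size() - 6;
  const bool is_layernorm = layername.find("layernorm") != std::string::npos;
  return ends_with_weight && !is_layernorm;
}

Keeping layernorm weights out of quantization follows the same convention the converter uses elsewhere in this series (norm tensors stay in float32): the 1-D norm tensors are tiny, so quantizing them saves almost nothing while risking accuracy.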