Dan Johansson committed on
Commit
79fa7cf
·
unverified ·
1 Parent(s): fbe8350

whisper : add support for backends with multiple ggml_backend_buffer_type (#2863)

Browse files

* whisper : add support for ggml_backend_buffer_type

Signed-off-by: Dan Johansson <dan.johansson@arm.com>

* fix compile error when building on Ubuntu

Signed-off-by: Dan Johansson <dan.johansson@arm.com>

* remove copyright header from include file

Signed-off-by: Dan Johansson <dan.johansson@arm.com>

---------

Signed-off-by: Dan Johansson <dan.johansson@arm.com>

Files changed (3) hide show
  1. src/CMakeLists.txt +1 -0
  2. src/whisper-arch.h +141 -0
  3. src/whisper.cpp +240 -188
src/CMakeLists.txt CHANGED
@@ -102,6 +102,7 @@ endif()
102
 
103
  add_library(whisper
104
  ../include/whisper.h
 
105
  whisper.cpp
106
  )
107
 
 
102
 
103
  add_library(whisper
104
  ../include/whisper.h
105
+ whisper-arch.h
106
  whisper.cpp
107
  )
108
 
src/whisper-arch.h ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+
5
+ #include <map>
6
+
7
+ enum asr_tensor {
8
+ ASR_TENSOR_ENC_POS_EMBD,
9
+ ASR_TENSOR_DEC_POS_EMBD,
10
+ ASR_TENSOR_DEC_TOKEN_EMBD_WEIGHT,
11
+ ASR_TENSOR_LN_WEIGHT,
12
+ ASR_TENSOR_LN_BIAS,
13
+ ASR_TENSOR_CONV1_WEIGHT,
14
+ ASR_TENSOR_CONV1_BIAS,
15
+ ASR_TENSOR_CONV2_WEIGHT,
16
+ ASR_TENSOR_CONV2_BIAS,
17
+ ASR_TENSOR_LN_POST_WEIGHT,
18
+ ASR_TENSOR_LN_POST_BIAS,
19
+ ASR_TENSOR_MLP_LN_WEIGHT,
20
+ ASR_TENSOR_MLP_LN_BIAS,
21
+ ASR_TENSOR_MLP_0_WEIGHT,
22
+ ASR_TENSOR_MLP_0_BIAS,
23
+ ASR_TENSOR_MLP_2_WEIGHT,
24
+ ASR_TENSOR_MLP_2_BIAS,
25
+ ASR_TENSOR_ATTN_LN_WEIGHT,
26
+ ASR_TENSOR_ATTN_LN_BIAS,
27
+ ASR_TENSOR_ATTN_QUERY_WEIGHT,
28
+ ASR_TENSOR_ATTN_QUERY_BIAS,
29
+ ASR_TENSOR_ATTN_KEY_WEIGHT,
30
+ ASR_TENSOR_ATTN_VALUE_WEIGHT,
31
+ ASR_TENSOR_ATTN_VALUE_BIAS,
32
+ ASR_TENSOR_ATTN_OUT_WEIGHT,
33
+ ASR_TENSOR_ATTN_OUT_BIAS,
34
+ };
35
+
36
+ enum asr_system {
37
+ ASR_SYSTEM_ENCODER,
38
+ ASR_SYSTEM_DECODER,
39
+ ASR_SYSTEM_CROSS
40
+ };
41
+
42
+ static const std::map<asr_system, std::map<asr_tensor, const char *>> ASR_TENSOR_NAMES = {
43
+ {
44
+ ASR_SYSTEM_ENCODER,
45
+ {
46
+ {ASR_TENSOR_ENC_POS_EMBD, "encoder.positional_embedding"},
47
+ {ASR_TENSOR_CONV1_WEIGHT, "encoder.conv1.weight"},
48
+ {ASR_TENSOR_CONV1_BIAS, "encoder.conv1.bias"},
49
+ {ASR_TENSOR_CONV2_WEIGHT, "encoder.conv2.weight"},
50
+ {ASR_TENSOR_CONV2_BIAS, "encoder.conv2.bias"},
51
+ {ASR_TENSOR_LN_WEIGHT, "encoder.ln_post.weight"},
52
+ {ASR_TENSOR_LN_POST_BIAS, "encoder.ln_post.bias"},
53
+ {ASR_TENSOR_MLP_LN_WEIGHT, "encoder.blocks.%d.mlp_ln.weight"},
54
+ {ASR_TENSOR_MLP_LN_BIAS, "encoder.blocks.%d.mlp_ln.bias"},
55
+ {ASR_TENSOR_MLP_0_WEIGHT, "encoder.blocks.%d.mlp.0.weight"},
56
+ {ASR_TENSOR_MLP_0_BIAS, "encoder.blocks.%d.mlp.0.bias"},
57
+ {ASR_TENSOR_MLP_2_WEIGHT, "encoder.blocks.%d.mlp.2.weight"},
58
+ {ASR_TENSOR_MLP_2_BIAS, "encoder.blocks.%d.mlp.2.bias"},
59
+ {ASR_TENSOR_ATTN_LN_WEIGHT, "encoder.blocks.%d.attn_ln.weight"},
60
+ {ASR_TENSOR_ATTN_LN_BIAS, "encoder.blocks.%d.attn_ln.bias"},
61
+ {ASR_TENSOR_ATTN_QUERY_WEIGHT, "encoder.blocks.%d.attn.query.weight"},
62
+ {ASR_TENSOR_ATTN_QUERY_BIAS, "encoder.blocks.%d.attn.query.bias"},
63
+ {ASR_TENSOR_ATTN_KEY_WEIGHT, "encoder.blocks.%d.attn.key.weight"},
64
+ {ASR_TENSOR_ATTN_VALUE_WEIGHT, "encoder.blocks.%d.attn.value.weight"},
65
+ {ASR_TENSOR_ATTN_VALUE_BIAS, "encoder.blocks.%d.attn.value.bias"},
66
+ {ASR_TENSOR_ATTN_OUT_WEIGHT, "encoder.blocks.%d.attn.out.weight"},
67
+ {ASR_TENSOR_ATTN_OUT_BIAS, "encoder.blocks.%d.attn.out.bias"},
68
+ },
69
+ },
70
+ {
71
+ ASR_SYSTEM_DECODER,
72
+ {
73
+ {ASR_TENSOR_DEC_POS_EMBD, "decoder.positional_embedding"},
74
+ {ASR_TENSOR_DEC_TOKEN_EMBD_WEIGHT, "decoder.token_embedding.weight"},
75
+ {ASR_TENSOR_LN_WEIGHT, "decoder.ln.weight"},
76
+ {ASR_TENSOR_LN_BIAS, "decoder.ln.bias"},
77
+
78
+ {ASR_TENSOR_MLP_LN_WEIGHT, "decoder.blocks.%d.mlp_ln.weight"},
79
+ {ASR_TENSOR_MLP_LN_BIAS, "decoder.blocks.%d.mlp_ln.bias"},
80
+ {ASR_TENSOR_MLP_0_WEIGHT, "decoder.blocks.%d.mlp.0.weight"},
81
+ {ASR_TENSOR_MLP_0_BIAS, "decoder.blocks.%d.mlp.0.bias"},
82
+ {ASR_TENSOR_MLP_2_WEIGHT, "decoder.blocks.%d.mlp.2.weight"},
83
+ {ASR_TENSOR_MLP_2_BIAS, "decoder.blocks.%d.mlp.2.bias"},
84
+ {ASR_TENSOR_ATTN_LN_WEIGHT, "decoder.blocks.%d.attn_ln.weight"},
85
+ {ASR_TENSOR_ATTN_LN_BIAS, "decoder.blocks.%d.attn_ln.bias"},
86
+ {ASR_TENSOR_ATTN_QUERY_WEIGHT, "decoder.blocks.%d.attn.query.weight"},
87
+ {ASR_TENSOR_ATTN_QUERY_BIAS, "decoder.blocks.%d.attn.query.bias"},
88
+ {ASR_TENSOR_ATTN_KEY_WEIGHT, "decoder.blocks.%d.attn.key.weight"},
89
+ {ASR_TENSOR_ATTN_VALUE_WEIGHT, "decoder.blocks.%d.attn.value.weight"},
90
+ {ASR_TENSOR_ATTN_VALUE_BIAS, "decoder.blocks.%d.attn.value.bias"},
91
+ {ASR_TENSOR_ATTN_OUT_WEIGHT, "decoder.blocks.%d.attn.out.weight"},
92
+ {ASR_TENSOR_ATTN_OUT_BIAS, "decoder.blocks.%d.attn.out.bias"},
93
+ },
94
+ },
95
+ {
96
+ ASR_SYSTEM_CROSS,
97
+ {
98
+ {ASR_TENSOR_ATTN_LN_WEIGHT, "decoder.blocks.%d.cross_attn_ln.weight"},
99
+ {ASR_TENSOR_ATTN_LN_BIAS, "decoder.blocks.%d.cross_attn_ln.bias"},
100
+ {ASR_TENSOR_ATTN_QUERY_WEIGHT, "decoder.blocks.%d.cross_attn.query.weight"},
101
+ {ASR_TENSOR_ATTN_QUERY_BIAS, "decoder.blocks.%d.cross_attn.query.bias"},
102
+ {ASR_TENSOR_ATTN_KEY_WEIGHT, "decoder.blocks.%d.cross_attn.key.weight"},
103
+ {ASR_TENSOR_ATTN_VALUE_WEIGHT, "decoder.blocks.%d.cross_attn.value.weight"},
104
+ {ASR_TENSOR_ATTN_VALUE_BIAS, "decoder.blocks.%d.cross_attn.value.bias"},
105
+ {ASR_TENSOR_ATTN_OUT_WEIGHT, "decoder.blocks.%d.cross_attn.out.weight"},
106
+ {ASR_TENSOR_ATTN_OUT_BIAS, "decoder.blocks.%d.cross_attn.out.bias"},
107
+ },
108
+ },
109
+ };
110
+
111
+ static const std::map<asr_tensor, ggml_op> ASR_TENSOR_INFO = {
112
+ {ASR_TENSOR_ENC_POS_EMBD, GGML_OP_ADD},
113
+ {ASR_TENSOR_DEC_POS_EMBD, GGML_OP_GET_ROWS},
114
+ // Note: ASR_TENSOR_DEC_TOKEN_EMBD_WEIGHT is also used by GGML_OP_MUL_MAT. Need to figure out how to handle
115
+ // weight tensors that are used by multiple different operators when extra_buffer_type implementations accelerate
116
+ // more than just GGML_OP_MUL_MAT.
117
+ {ASR_TENSOR_DEC_TOKEN_EMBD_WEIGHT, GGML_OP_GET_ROWS},
118
+ {ASR_TENSOR_LN_WEIGHT, GGML_OP_MUL},
119
+ {ASR_TENSOR_LN_BIAS, GGML_OP_ADD},
120
+ {ASR_TENSOR_CONV1_WEIGHT, GGML_OP_IM2COL},
121
+ {ASR_TENSOR_CONV1_BIAS, GGML_OP_ADD},
122
+ {ASR_TENSOR_CONV2_WEIGHT, GGML_OP_IM2COL},
123
+ {ASR_TENSOR_CONV2_BIAS, GGML_OP_ADD},
124
+ {ASR_TENSOR_LN_POST_WEIGHT, GGML_OP_MUL},
125
+ {ASR_TENSOR_LN_POST_BIAS, GGML_OP_ADD},
126
+ {ASR_TENSOR_MLP_LN_WEIGHT, GGML_OP_MUL},
127
+ {ASR_TENSOR_MLP_LN_BIAS, GGML_OP_ADD},
128
+ {ASR_TENSOR_MLP_0_WEIGHT, GGML_OP_MUL_MAT},
129
+ {ASR_TENSOR_MLP_0_BIAS, GGML_OP_ADD},
130
+ {ASR_TENSOR_MLP_2_WEIGHT, GGML_OP_MUL_MAT},
131
+ {ASR_TENSOR_MLP_2_BIAS, GGML_OP_ADD},
132
+ {ASR_TENSOR_ATTN_LN_WEIGHT, GGML_OP_MUL},
133
+ {ASR_TENSOR_ATTN_LN_BIAS, GGML_OP_ADD},
134
+ {ASR_TENSOR_ATTN_QUERY_WEIGHT, GGML_OP_MUL_MAT},
135
+ {ASR_TENSOR_ATTN_QUERY_BIAS, GGML_OP_ADD},
136
+ {ASR_TENSOR_ATTN_KEY_WEIGHT, GGML_OP_MUL_MAT},
137
+ {ASR_TENSOR_ATTN_VALUE_WEIGHT, GGML_OP_MUL_MAT},
138
+ {ASR_TENSOR_ATTN_VALUE_BIAS, GGML_OP_ADD},
139
+ {ASR_TENSOR_ATTN_OUT_WEIGHT, GGML_OP_MUL_MAT},
140
+ {ASR_TENSOR_ATTN_OUT_BIAS, GGML_OP_ADD},
141
+ };
src/whisper.cpp CHANGED
@@ -1,4 +1,5 @@
1
  #include "whisper.h"
 
2
 
3
  #include "ggml.h"
4
  #include "ggml-cpp.h"
@@ -18,6 +19,7 @@
18
  #include <cassert>
19
  #define _USE_MATH_DEFINES
20
  #include <cmath>
 
21
  #include <codecvt>
22
  #include <cstdarg>
23
  #include <cstdio>
@@ -143,6 +145,21 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text
143
  #define WHISPER_MAX_DECODERS 8
144
  #define WHISPER_MAX_NODES 4096
145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  //
147
  // ggml helpers
148
  //
@@ -778,10 +795,10 @@ struct whisper_model {
778
  std::vector<whisper_layer_decoder> layers_decoder;
779
 
780
  // ggml context that contains all the meta information about the model tensors
781
- struct ggml_context * ctx = nullptr;
782
 
783
  // the model backend data is read-only and can be shared between processors
784
- ggml_backend_buffer_t buffer = nullptr;
785
 
786
  // tensors
787
  int n_loaded;
@@ -1364,28 +1381,109 @@ static std::vector<ggml_backend_t> whisper_backend_init(const whisper_context_pa
1364
  return result;
1365
  }
1366
 
1367
- static ggml_backend_buffer_type_t whisper_default_buffer_type(const whisper_context_params & params) {
1368
- ggml_backend_buffer_type_t result = ggml_backend_cpu_buffer_type();
1369
 
1370
- if (!params.use_gpu) {
1371
- return result;
1372
- }
1373
 
1374
- int cnt = 0;
1375
- for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
1376
- ggml_backend_dev_t dev = ggml_backend_dev_get(i);
1377
- if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
1378
- if (cnt == 0 || cnt == params.gpu_device) {
1379
- result = ggml_backend_dev_buffer_type(dev);
 
 
 
 
 
 
 
 
 
 
1380
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1381
 
1382
- if (++cnt > params.gpu_device) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1383
  break;
1384
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
1385
  }
1386
  }
1387
 
1388
- return result;
1389
  }
1390
 
1391
  // load the model from a ggml file
@@ -1594,31 +1692,65 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1594
  const ggml_type wtype = wctx.wtype;
1595
  const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
1596
 
1597
- // create the ggml context
1598
- {
1599
- const auto & hparams = model.hparams;
1600
 
1601
- const int n_audio_layer = hparams.n_audio_layer;
1602
- const int n_text_layer = hparams.n_text_layer;
1603
 
1604
- const size_t n_tensors = 10 /* input */ + 15 + 15*n_audio_layer + 24*n_text_layer;
1605
 
1606
- struct ggml_init_params params = {
1607
- /*.mem_size =*/ n_tensors*ggml_tensor_overhead(),
1608
- /*.mem_buffer =*/ nullptr,
1609
- /*.no_alloc =*/ true,
1610
- };
 
 
 
 
1611
 
1612
- model.ctx = ggml_init(params);
1613
- if (!model.ctx) {
1614
- WHISPER_LOG_ERROR("%s: ggml_init() failed\n", __func__);
1615
- return false;
 
 
 
 
 
1616
  }
1617
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1618
 
1619
  // prepare tensors for the weights
1620
  {
1621
- auto & ctx = model.ctx;
 
 
 
 
 
 
1622
 
1623
  const auto & hparams = model.hparams;
1624
 
@@ -1638,189 +1770,108 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1638
  model.layers_decoder.resize(n_text_layer);
1639
 
1640
  // encoder
1641
- {
1642
- model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
1643
-
1644
- model.e_conv_1_w = ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state);
1645
- model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
1646
-
1647
- model.e_conv_2_w = ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state);
1648
- model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
1649
-
1650
- model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1651
- model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1652
-
1653
- // map by name
1654
- model.tensors["encoder.positional_embedding"] = model.e_pe;
1655
-
1656
- model.tensors["encoder.conv1.weight"] = model.e_conv_1_w;
1657
- model.tensors["encoder.conv1.bias"] = model.e_conv_1_b;
1658
-
1659
- model.tensors["encoder.conv2.weight"] = model.e_conv_2_w;
1660
- model.tensors["encoder.conv2.bias"] = model.e_conv_2_b;
1661
 
1662
- model.tensors["encoder.ln_post.weight"] = model.e_ln_w;
1663
- model.tensors["encoder.ln_post.bias"] = model.e_ln_b;
1664
 
1665
- for (int i = 0; i < n_audio_layer; ++i) {
1666
- auto & layer = model.layers_encoder[i];
1667
 
1668
- layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1669
- layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1670
 
1671
- layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state);
1672
- layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state);
1673
 
1674
- layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state);
1675
- layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1676
 
1677
- layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1678
- layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1679
 
1680
- layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1681
- layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1682
 
1683
- layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
 
1684
 
1685
- layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1686
- layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1687
 
1688
- layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1689
- layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1690
 
1691
- // map by name
1692
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
1693
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b;
1694
 
1695
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w;
1696
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b;
1697
-
1698
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w;
1699
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b;
1700
-
1701
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w;
1702
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b;
1703
-
1704
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w;
1705
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b;
1706
-
1707
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w;
1708
-
1709
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w;
1710
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b;
1711
-
1712
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w;
1713
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b;
1714
- }
1715
  }
1716
 
1717
  // decoder
1718
- {
1719
- model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx);
1720
-
1721
- model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab);
1722
-
1723
- model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1724
- model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1725
-
1726
- // map by name
1727
- model.tensors["decoder.positional_embedding"] = model.d_pe;
1728
-
1729
- model.tensors["decoder.token_embedding.weight"] = model.d_te;
1730
-
1731
- model.tensors["decoder.ln.weight"] = model.d_ln_w;
1732
- model.tensors["decoder.ln.bias"] = model.d_ln_b;
1733
-
1734
- for (int i = 0; i < n_text_layer; ++i) {
1735
- auto & layer = model.layers_decoder[i];
1736
-
1737
- layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1738
- layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1739
-
1740
- layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state);
1741
- layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state);
1742
-
1743
- layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state);
1744
- layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1745
-
1746
- layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1747
- layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1748
-
1749
- layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1750
- layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1751
-
1752
- layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1753
-
1754
- layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1755
- layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1756
 
1757
- layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1758
- layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1759
 
1760
- layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1761
- layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1762
 
1763
- layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1764
- layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1765
 
1766
- layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
 
1767
 
1768
- layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1769
- layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1770
 
1771
- layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1772
- layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1773
 
1774
- // map by name
1775
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
1776
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b;
1777
 
1778
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w;
1779
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b;
1780
 
1781
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w;
1782
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b;
1783
 
1784
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w;
1785
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b;
1786
 
1787
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w;
1788
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b;
1789
 
1790
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w;
 
1791
 
1792
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w;
1793
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b;
1794
 
1795
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w;
1796
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b;
1797
 
1798
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.weight"] = layer.cross_attn_ln_0_w;
1799
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.bias"] = layer.cross_attn_ln_0_b;
1800
 
1801
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.weight"] = layer.cross_attn_q_w;
1802
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.bias"] = layer.cross_attn_q_b;
1803
-
1804
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.key.weight"] = layer.cross_attn_k_w;
1805
-
1806
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.weight"] = layer.cross_attn_v_w;
1807
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.bias"] = layer.cross_attn_v_b;
1808
-
1809
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.weight"] = layer.cross_attn_ln_1_w;
1810
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.bias"] = layer.cross_attn_ln_1_b;
1811
- }
1812
  }
 
 
1813
  }
1814
 
1815
  // allocate tensors in the backend buffers
1816
- model.buffer = ggml_backend_alloc_ctx_tensors_from_buft(model.ctx, whisper_default_buffer_type(wctx.params));
1817
- if (!model.buffer) {
1818
- WHISPER_LOG_ERROR("%s: failed to allocate memory for the model\n", __func__);
1819
- return false;
1820
- }
 
1821
 
1822
- size_t size_main = ggml_backend_buffer_get_size(model.buffer);
1823
- WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB\n", __func__, ggml_backend_buffer_name(model.buffer), size_main / 1e6);
 
 
1824
 
1825
  // load weights
1826
  {
@@ -1883,11 +1934,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1883
  return false;
1884
  }
1885
 
1886
- //ggml_backend_t backend = wctx.backend;
1887
-
1888
- //printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());
1889
-
1890
- if (ggml_backend_buffer_is_host(model.buffer)) {
1891
  // for the CPU and Metal backend, we can read directly into the tensor
1892
  loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
1893
  BYTESWAP_TENSOR(tensor);
@@ -1900,7 +1947,6 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1900
  ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
1901
  }
1902
 
1903
- //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1e6);
1904
  total_size += ggml_nbytes(tensor);
1905
  model.n_loaded++;
1906
  }
@@ -1915,7 +1961,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1915
  }
1916
  }
1917
 
1918
- ggml_backend_buffer_set_usage(model.buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
 
 
1919
 
1920
  wctx.t_load_us = ggml_time_us() - t_start_us;
1921
 
@@ -3806,9 +3854,13 @@ void whisper_free_state(struct whisper_state * state) {
3806
 
3807
  void whisper_free(struct whisper_context * ctx) {
3808
  if (ctx) {
3809
- ggml_free(ctx->model.ctx);
 
 
3810
 
3811
- ggml_backend_buffer_free(ctx->model.buffer);
 
 
3812
 
3813
  whisper_free_state(ctx->state);
3814
 
 
1
  #include "whisper.h"
2
+ #include "whisper-arch.h"
3
 
4
  #include "ggml.h"
5
  #include "ggml-cpp.h"
 
19
  #include <cassert>
20
  #define _USE_MATH_DEFINES
21
  #include <cmath>
22
+ #include <climits>
23
  #include <codecvt>
24
  #include <cstdarg>
25
  #include <cstdio>
 
145
  #define WHISPER_MAX_DECODERS 8
146
  #define WHISPER_MAX_NODES 4096
147
 
148
+ static std::string format(const char * fmt, ...) {
149
+ va_list ap;
150
+ va_list ap2;
151
+ va_start(ap, fmt);
152
+ va_copy(ap2, ap);
153
+ int size = vsnprintf(NULL, 0, fmt, ap);
154
+ GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
155
+ std::vector<char> buf(size + 1);
156
+ int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
157
+ GGML_ASSERT(size2 == size);
158
+ va_end(ap2);
159
+ va_end(ap);
160
+ return std::string(buf.data(), size);
161
+ }
162
+
163
  //
164
  // ggml helpers
165
  //
 
795
  std::vector<whisper_layer_decoder> layers_decoder;
796
 
797
  // ggml context that contains all the meta information about the model tensors
798
+ std::vector<ggml_context *> ctxs;
799
 
800
  // the model backend data is read-only and can be shared between processors
801
+ std::vector<ggml_backend_buffer_t> buffers;
802
 
803
  // tensors
804
  int n_loaded;
 
1381
  return result;
1382
  }
1383
 
1384
+ using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
 
1385
 
1386
+ static buft_list_t make_buft_list(whisper_context_params & params) {
1387
+ // Prio order: GPU -> CPU Extra -> CPU
1388
+ buft_list_t buft_list;
1389
 
1390
+ // GPU
1391
+ if (params.use_gpu) {
1392
+ int cnt = 0;
1393
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
1394
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
1395
+ if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
1396
+ if (cnt == 0 || cnt == params.gpu_device) {
1397
+ auto * buft = ggml_backend_dev_buffer_type(dev);
1398
+ if (buft) {
1399
+ buft_list.emplace_back(dev, buft);
1400
+ }
1401
+ }
1402
+
1403
+ if (++cnt > params.gpu_device) {
1404
+ break;
1405
+ }
1406
  }
1407
+ }
1408
+ }
1409
+
1410
+ // CPU Extra
1411
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
1412
+ auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
1413
+ auto get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
1414
+ ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
1415
+ if (get_extra_bufts_fn) {
1416
+ ggml_backend_buffer_type_t * extra_bufts = get_extra_bufts_fn(cpu_dev);
1417
+ while (extra_bufts && *extra_bufts) {
1418
+ buft_list.emplace_back(cpu_dev, *extra_bufts);
1419
+ ++extra_bufts;
1420
+ }
1421
+ }
1422
+
1423
+ // CPU
1424
+ buft_list.emplace_back(cpu_dev, ggml_backend_cpu_buffer_type());
1425
+
1426
+ return buft_list;
1427
+ }
1428
+
1429
+ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
1430
+ bool op_supported = true;
1431
+
1432
+ if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU ||
1433
+ (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && buft == ggml_backend_cpu_buffer_type())) {
1434
+ // GPU and default CPU backend support all operators
1435
+ op_supported = true;
1436
+ } else {
1437
+ switch (op) {
1438
+ // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT
1439
+ case GGML_OP_MUL_MAT: {
1440
+ ggml_init_params params = {
1441
+ /*.mem_size =*/ 2 * ggml_tensor_overhead(),
1442
+ /*.mem_buffer =*/ nullptr,
1443
+ /*.no_alloc =*/ true,
1444
+ };
1445
+
1446
+ ggml_context_ptr ctx_ptr { ggml_init(params) };
1447
+ if (!ctx_ptr) {
1448
+ throw std::runtime_error("failed to create ggml context");
1449
+ }
1450
+ ggml_context * ctx = ctx_ptr.get();
1451
 
1452
+ ggml_tensor * op_tensor = nullptr;
1453
+
1454
+ int64_t n_ctx = hparams.n_audio_ctx;
1455
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
1456
+ op_tensor = ggml_mul_mat(ctx, w, b);
1457
+
1458
+ // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
1459
+ GGML_ASSERT(w->buffer == nullptr);
1460
+ w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
1461
+ op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
1462
+ ggml_backend_buffer_free(w->buffer);
1463
+ w->buffer = nullptr;
1464
+ break;
1465
+ }
1466
+ default: {
1467
+ op_supported = false;
1468
  break;
1469
  }
1470
+ };
1471
+ }
1472
+
1473
+ return op_supported;
1474
+ }
1475
+
1476
+ static ggml_backend_buffer_type_t select_weight_buft(const whisper_hparams & hparams, ggml_tensor * w, ggml_op op, buft_list_t buft_list) {
1477
+ GGML_ASSERT(!buft_list.empty());
1478
+ for (const auto & p : buft_list) {
1479
+ ggml_backend_dev_t dev = p.first;
1480
+ ggml_backend_buffer_type_t buft = p.second;
1481
+ if (weight_buft_supported(hparams, w, op, buft, dev)) {
1482
+ return buft;
1483
  }
1484
  }
1485
 
1486
+ return nullptr;
1487
  }
1488
 
1489
  // load the model from a ggml file
 
1692
  const ggml_type wtype = wctx.wtype;
1693
  const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
1694
 
1695
+ const auto & hparams = model.hparams;
 
 
1696
 
1697
+ const int n_audio_layer = hparams.n_audio_layer;
1698
+ const int n_text_layer = hparams.n_text_layer;
1699
 
1700
+ const size_t n_tensors = 10 /* input */ + 15 + 15*n_audio_layer + 24*n_text_layer;
1701
 
1702
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
1703
+ auto get_ctx = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
1704
+ auto it = ctx_map.find(buft);
1705
+ if (it == ctx_map.end()) {
1706
+ ggml_init_params params = {
1707
+ /*.mem_size =*/ n_tensors * ggml_tensor_overhead(),
1708
+ /*.mem_buffer =*/ nullptr,
1709
+ /*.no_alloc =*/ true,
1710
+ };
1711
 
1712
+ ggml_context * ctx = ggml_init(params);
1713
+ if (!ctx) {
1714
+ throw std::runtime_error("failed to create ggml context");
1715
+ }
1716
+
1717
+ ctx_map[buft] = ctx;
1718
+ model.ctxs.emplace_back(ctx);
1719
+
1720
+ return ctx;
1721
  }
1722
+
1723
+ return it->second;
1724
+ };
1725
+
1726
+ // Create a list of available bufts, in priority order
1727
+ buft_list_t buft_list = make_buft_list(wctx.params);
1728
+
1729
+ auto create_tensor = [&](asr_tensor type, asr_system system, ggml_tensor * meta, int layer = 0) -> ggml_tensor * {
1730
+ ggml_op op = ASR_TENSOR_INFO.at(type);
1731
+ ggml_backend_buffer_type_t buft = select_weight_buft(hparams, meta, op, buft_list);
1732
+ if (!buft) {
1733
+ throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", ASR_TENSOR_NAMES.at(system).at(type)));
1734
+ }
1735
+
1736
+ ggml_context * ctx = get_ctx(buft);
1737
+ ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
1738
+
1739
+ model.tensors[format(ASR_TENSOR_NAMES.at(system).at(type), layer)] = tensor;
1740
+
1741
+ return tensor;
1742
+ };
1743
+
1744
 
1745
  // prepare tensors for the weights
1746
  {
1747
+ ggml_init_params params = {
1748
+ /*.mem_size =*/ n_tensors * ggml_tensor_overhead(),
1749
+ /*.mem_buffer =*/ nullptr,
1750
+ /*.no_alloc =*/ true,
1751
+ };
1752
+
1753
+ ggml_context * ctx = ggml_init(params);
1754
 
1755
  const auto & hparams = model.hparams;
1756
 
 
1770
  model.layers_decoder.resize(n_text_layer);
1771
 
1772
  // encoder
1773
+ model.e_pe = create_tensor(ASR_TENSOR_ENC_POS_EMBD, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1774
 
1775
+ model.e_conv_1_w = create_tensor(ASR_TENSOR_CONV1_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state));
1776
+ model.e_conv_1_b = create_tensor(ASR_TENSOR_CONV1_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state));
1777
 
1778
+ model.e_conv_2_w = create_tensor(ASR_TENSOR_CONV2_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state));
1779
+ model.e_conv_2_b = create_tensor(ASR_TENSOR_CONV2_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state));
1780
 
1781
+ model.e_ln_w = create_tensor(ASR_TENSOR_LN_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state));
1782
+ model.e_ln_b = create_tensor(ASR_TENSOR_LN_POST_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state));
1783
 
1784
+ for (int i = 0; i < n_audio_layer; ++i) {
1785
+ auto & layer = model.layers_encoder[i];
1786
 
1787
+ layer.mlp_ln_w = create_tensor(ASR_TENSOR_MLP_LN_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state), i);
1788
+ layer.mlp_ln_b = create_tensor(ASR_TENSOR_MLP_LN_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state), i);
1789
 
1790
+ layer.mlp_0_w = create_tensor(ASR_TENSOR_MLP_0_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state), i);
1791
+ layer.mlp_0_b = create_tensor(ASR_TENSOR_MLP_0_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state), i);
1792
 
1793
+ layer.mlp_1_w = create_tensor(ASR_TENSOR_MLP_2_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state), i);
1794
+ layer.mlp_1_b = create_tensor(ASR_TENSOR_MLP_2_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state), i);
1795
 
1796
+ layer.attn_ln_0_w = create_tensor(ASR_TENSOR_ATTN_LN_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state), i);
1797
+ layer.attn_ln_0_b = create_tensor(ASR_TENSOR_ATTN_LN_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state), i);
1798
 
1799
+ layer.attn_q_w = create_tensor(ASR_TENSOR_ATTN_QUERY_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state), i);
1800
+ layer.attn_q_b = create_tensor(ASR_TENSOR_ATTN_QUERY_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state), i);
1801
 
1802
+ layer.attn_k_w = create_tensor(ASR_TENSOR_ATTN_KEY_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state), i);
 
1803
 
1804
+ layer.attn_v_w = create_tensor(ASR_TENSOR_ATTN_VALUE_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state), i);
1805
+ layer.attn_v_b = create_tensor(ASR_TENSOR_ATTN_VALUE_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state), i);
 
1806
 
1807
+ layer.attn_ln_1_w = create_tensor(ASR_TENSOR_ATTN_OUT_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state), i);
1808
+ layer.attn_ln_1_b = create_tensor(ASR_TENSOR_ATTN_OUT_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state), i);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1809
  }
1810
 
1811
  // decoder
1812
+ model.d_pe = create_tensor(ASR_TENSOR_DEC_POS_EMBD, ASR_SYSTEM_DECODER, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1813
 
1814
+ model.d_te = create_tensor(ASR_TENSOR_DEC_TOKEN_EMBD_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab));
 
1815
 
1816
+ model.d_ln_w = create_tensor(ASR_TENSOR_LN_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state));
1817
+ model.d_ln_b = create_tensor(ASR_TENSOR_LN_BIAS, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state));
1818
 
1819
+ for (int i = 0; i < n_text_layer; ++i) {
1820
+ auto & layer = model.layers_decoder[i];
1821
 
1822
+ layer.mlp_ln_w = create_tensor(ASR_TENSOR_MLP_LN_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
1823
+ layer.mlp_ln_b = create_tensor(ASR_TENSOR_MLP_LN_BIAS, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
1824
 
1825
+ layer.mlp_0_w = create_tensor(ASR_TENSOR_MLP_0_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state), i);
1826
+ layer.mlp_0_b = create_tensor(ASR_TENSOR_MLP_0_BIAS, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state), i);
1827
 
1828
+ layer.mlp_1_w = create_tensor(ASR_TENSOR_MLP_2_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state), i);
1829
+ layer.mlp_1_b = create_tensor(ASR_TENSOR_MLP_2_BIAS, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
1830
 
1831
+ layer.attn_ln_0_w = create_tensor(ASR_TENSOR_ATTN_LN_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
1832
+ layer.attn_ln_0_b = create_tensor(ASR_TENSOR_ATTN_LN_BIAS, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
 
1833
 
1834
+ layer.attn_q_w = create_tensor(ASR_TENSOR_ATTN_QUERY_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state), i);
1835
+ layer.attn_q_b = create_tensor(ASR_TENSOR_ATTN_QUERY_BIAS, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
1836
 
1837
+ layer.attn_k_w = create_tensor(ASR_TENSOR_ATTN_KEY_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state), i);
 
1838
 
1839
+ layer.attn_v_w = create_tensor(ASR_TENSOR_ATTN_VALUE_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state), i);
1840
+ layer.attn_v_b = create_tensor(ASR_TENSOR_ATTN_VALUE_BIAS, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
1841
 
1842
+ layer.attn_ln_1_w = create_tensor(ASR_TENSOR_ATTN_OUT_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state), i);
1843
+ layer.attn_ln_1_b = create_tensor(ASR_TENSOR_ATTN_OUT_BIAS, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
1844
 
1845
+ layer.cross_attn_ln_0_w = create_tensor(ASR_TENSOR_ATTN_LN_WEIGHT, ASR_SYSTEM_CROSS, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
1846
+ layer.cross_attn_ln_0_b = create_tensor(ASR_TENSOR_ATTN_LN_BIAS, ASR_SYSTEM_CROSS, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
1847
 
1848
+ layer.cross_attn_q_w = create_tensor(ASR_TENSOR_ATTN_QUERY_WEIGHT, ASR_SYSTEM_CROSS, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state), i);
1849
+ layer.cross_attn_q_b = create_tensor(ASR_TENSOR_ATTN_QUERY_BIAS, ASR_SYSTEM_CROSS, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
1850
 
1851
+ layer.cross_attn_k_w = create_tensor(ASR_TENSOR_ATTN_KEY_WEIGHT, ASR_SYSTEM_CROSS, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state), i);
 
1852
 
1853
+ layer.cross_attn_v_w = create_tensor(ASR_TENSOR_ATTN_VALUE_WEIGHT, ASR_SYSTEM_CROSS, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state), i);
1854
+ layer.cross_attn_v_b = create_tensor(ASR_TENSOR_ATTN_VALUE_BIAS, ASR_SYSTEM_CROSS, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
1855
 
1856
+ layer.cross_attn_ln_1_w = create_tensor(ASR_TENSOR_ATTN_OUT_WEIGHT, ASR_SYSTEM_CROSS, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state), i);
1857
+ layer.cross_attn_ln_1_b = create_tensor(ASR_TENSOR_ATTN_OUT_BIAS, ASR_SYSTEM_CROSS, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
 
 
 
 
 
 
 
 
 
1858
  }
1859
+
1860
+ ggml_free(ctx);
1861
  }
1862
 
1863
  // allocate tensors in the backend buffers
1864
+ for (auto & p : ctx_map) {
1865
+ ggml_backend_buffer_type_t buft = p.first;
1866
+ ggml_context * ctx = p.second;
1867
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
1868
+ if (buf) {
1869
+ model.buffers.emplace_back(buf);
1870
 
1871
+ size_t size_main = ggml_backend_buffer_get_size(buf);
1872
+ WHISPER_LOG_INFO("%s: %12s total size = %8.2f MB\n", __func__, ggml_backend_buffer_name(buf), size_main / 1e6);
1873
+ }
1874
+ }
1875
 
1876
  // load weights
1877
  {
 
1934
  return false;
1935
  }
1936
 
1937
+ if (ggml_backend_buffer_is_host(tensor->buffer)) {
 
 
 
 
1938
  // for the CPU and Metal backend, we can read directly into the tensor
1939
  loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
1940
  BYTESWAP_TENSOR(tensor);
 
1947
  ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
1948
  }
1949
 
 
1950
  total_size += ggml_nbytes(tensor);
1951
  model.n_loaded++;
1952
  }
 
1961
  }
1962
  }
1963
 
1964
+ for (auto & buf : model.buffers) {
1965
+ ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
1966
+ }
1967
 
1968
  wctx.t_load_us = ggml_time_us() - t_start_us;
1969
 
 
3854
 
3855
  void whisper_free(struct whisper_context * ctx) {
3856
  if (ctx) {
3857
+ for (ggml_context * context : ctx->model.ctxs) {
3858
+ ggml_free(context);
3859
+ }
3860
 
3861
+ for (ggml_backend_buffer_t buf : ctx->model.buffers) {
3862
+ ggml_backend_buffer_free(buf);
3863
+ }
3864
 
3865
  whisper_free_state(ctx->state);
3866