Spaces:
Running
Running
Dan Johansson committed on
whisper : add support for backends with multiple ggml_backend_buffer_type (#2863)
Browse files
* whisper : add support for ggml_backend_buffer_type
Signed-off-by: Dan Johansson <dan.johansson@arm.com>
* fix compile error when building on Ubuntu
Signed-off-by: Dan Johansson <dan.johansson@arm.com>
* remove copyright header from include file
Signed-off-by: Dan Johansson <dan.johansson@arm.com>
---------
Signed-off-by: Dan Johansson <dan.johansson@arm.com>
- src/CMakeLists.txt +1 -0
- src/whisper-arch.h +141 -0
- src/whisper.cpp +240 -188
src/CMakeLists.txt
CHANGED
|
@@ -102,6 +102,7 @@ endif()
|
|
| 102 |
|
| 103 |
add_library(whisper
|
| 104 |
../include/whisper.h
|
|
|
|
| 105 |
whisper.cpp
|
| 106 |
)
|
| 107 |
|
|
|
|
| 102 |
|
| 103 |
add_library(whisper
|
| 104 |
../include/whisper.h
|
| 105 |
+
whisper-arch.h
|
| 106 |
whisper.cpp
|
| 107 |
)
|
| 108 |
|
src/whisper-arch.h
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include "ggml.h"
|
| 4 |
+
|
| 5 |
+
#include <map>
|
| 6 |
+
|
| 7 |
+
enum asr_tensor {
|
| 8 |
+
ASR_TENSOR_ENC_POS_EMBD,
|
| 9 |
+
ASR_TENSOR_DEC_POS_EMBD,
|
| 10 |
+
ASR_TENSOR_DEC_TOKEN_EMBD_WEIGHT,
|
| 11 |
+
ASR_TENSOR_LN_WEIGHT,
|
| 12 |
+
ASR_TENSOR_LN_BIAS,
|
| 13 |
+
ASR_TENSOR_CONV1_WEIGHT,
|
| 14 |
+
ASR_TENSOR_CONV1_BIAS,
|
| 15 |
+
ASR_TENSOR_CONV2_WEIGHT,
|
| 16 |
+
ASR_TENSOR_CONV2_BIAS,
|
| 17 |
+
ASR_TENSOR_LN_POST_WEIGHT,
|
| 18 |
+
ASR_TENSOR_LN_POST_BIAS,
|
| 19 |
+
ASR_TENSOR_MLP_LN_WEIGHT,
|
| 20 |
+
ASR_TENSOR_MLP_LN_BIAS,
|
| 21 |
+
ASR_TENSOR_MLP_0_WEIGHT,
|
| 22 |
+
ASR_TENSOR_MLP_0_BIAS,
|
| 23 |
+
ASR_TENSOR_MLP_2_WEIGHT,
|
| 24 |
+
ASR_TENSOR_MLP_2_BIAS,
|
| 25 |
+
ASR_TENSOR_ATTN_LN_WEIGHT,
|
| 26 |
+
ASR_TENSOR_ATTN_LN_BIAS,
|
| 27 |
+
ASR_TENSOR_ATTN_QUERY_WEIGHT,
|
| 28 |
+
ASR_TENSOR_ATTN_QUERY_BIAS,
|
| 29 |
+
ASR_TENSOR_ATTN_KEY_WEIGHT,
|
| 30 |
+
ASR_TENSOR_ATTN_VALUE_WEIGHT,
|
| 31 |
+
ASR_TENSOR_ATTN_VALUE_BIAS,
|
| 32 |
+
ASR_TENSOR_ATTN_OUT_WEIGHT,
|
| 33 |
+
ASR_TENSOR_ATTN_OUT_BIAS,
|
| 34 |
+
};
|
| 35 |
+
|
| 36 |
+
enum asr_system {
|
| 37 |
+
ASR_SYSTEM_ENCODER,
|
| 38 |
+
ASR_SYSTEM_DECODER,
|
| 39 |
+
ASR_SYSTEM_CROSS
|
| 40 |
+
};
|
| 41 |
+
|
| 42 |
+
static const std::map<asr_system, std::map<asr_tensor, const char *>> ASR_TENSOR_NAMES = {
|
| 43 |
+
{
|
| 44 |
+
ASR_SYSTEM_ENCODER,
|
| 45 |
+
{
|
| 46 |
+
{ASR_TENSOR_ENC_POS_EMBD, "encoder.positional_embedding"},
|
| 47 |
+
{ASR_TENSOR_CONV1_WEIGHT, "encoder.conv1.weight"},
|
| 48 |
+
{ASR_TENSOR_CONV1_BIAS, "encoder.conv1.bias"},
|
| 49 |
+
{ASR_TENSOR_CONV2_WEIGHT, "encoder.conv2.weight"},
|
| 50 |
+
{ASR_TENSOR_CONV2_BIAS, "encoder.conv2.bias"},
|
| 51 |
+
{ASR_TENSOR_LN_WEIGHT, "encoder.ln_post.weight"},
|
| 52 |
+
{ASR_TENSOR_LN_POST_BIAS, "encoder.ln_post.bias"},
|
| 53 |
+
{ASR_TENSOR_MLP_LN_WEIGHT, "encoder.blocks.%d.mlp_ln.weight"},
|
| 54 |
+
{ASR_TENSOR_MLP_LN_BIAS, "encoder.blocks.%d.mlp_ln.bias"},
|
| 55 |
+
{ASR_TENSOR_MLP_0_WEIGHT, "encoder.blocks.%d.mlp.0.weight"},
|
| 56 |
+
{ASR_TENSOR_MLP_0_BIAS, "encoder.blocks.%d.mlp.0.bias"},
|
| 57 |
+
{ASR_TENSOR_MLP_2_WEIGHT, "encoder.blocks.%d.mlp.2.weight"},
|
| 58 |
+
{ASR_TENSOR_MLP_2_BIAS, "encoder.blocks.%d.mlp.2.bias"},
|
| 59 |
+
{ASR_TENSOR_ATTN_LN_WEIGHT, "encoder.blocks.%d.attn_ln.weight"},
|
| 60 |
+
{ASR_TENSOR_ATTN_LN_BIAS, "encoder.blocks.%d.attn_ln.bias"},
|
| 61 |
+
{ASR_TENSOR_ATTN_QUERY_WEIGHT, "encoder.blocks.%d.attn.query.weight"},
|
| 62 |
+
{ASR_TENSOR_ATTN_QUERY_BIAS, "encoder.blocks.%d.attn.query.bias"},
|
| 63 |
+
{ASR_TENSOR_ATTN_KEY_WEIGHT, "encoder.blocks.%d.attn.key.weight"},
|
| 64 |
+
{ASR_TENSOR_ATTN_VALUE_WEIGHT, "encoder.blocks.%d.attn.value.weight"},
|
| 65 |
+
{ASR_TENSOR_ATTN_VALUE_BIAS, "encoder.blocks.%d.attn.value.bias"},
|
| 66 |
+
{ASR_TENSOR_ATTN_OUT_WEIGHT, "encoder.blocks.%d.attn.out.weight"},
|
| 67 |
+
{ASR_TENSOR_ATTN_OUT_BIAS, "encoder.blocks.%d.attn.out.bias"},
|
| 68 |
+
},
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
ASR_SYSTEM_DECODER,
|
| 72 |
+
{
|
| 73 |
+
{ASR_TENSOR_DEC_POS_EMBD, "decoder.positional_embedding"},
|
| 74 |
+
{ASR_TENSOR_DEC_TOKEN_EMBD_WEIGHT, "decoder.token_embedding.weight"},
|
| 75 |
+
{ASR_TENSOR_LN_WEIGHT, "decoder.ln.weight"},
|
| 76 |
+
{ASR_TENSOR_LN_BIAS, "decoder.ln.bias"},
|
| 77 |
+
|
| 78 |
+
{ASR_TENSOR_MLP_LN_WEIGHT, "decoder.blocks.%d.mlp_ln.weight"},
|
| 79 |
+
{ASR_TENSOR_MLP_LN_BIAS, "decoder.blocks.%d.mlp_ln.bias"},
|
| 80 |
+
{ASR_TENSOR_MLP_0_WEIGHT, "decoder.blocks.%d.mlp.0.weight"},
|
| 81 |
+
{ASR_TENSOR_MLP_0_BIAS, "decoder.blocks.%d.mlp.0.bias"},
|
| 82 |
+
{ASR_TENSOR_MLP_2_WEIGHT, "decoder.blocks.%d.mlp.2.weight"},
|
| 83 |
+
{ASR_TENSOR_MLP_2_BIAS, "decoder.blocks.%d.mlp.2.bias"},
|
| 84 |
+
{ASR_TENSOR_ATTN_LN_WEIGHT, "decoder.blocks.%d.attn_ln.weight"},
|
| 85 |
+
{ASR_TENSOR_ATTN_LN_BIAS, "decoder.blocks.%d.attn_ln.bias"},
|
| 86 |
+
{ASR_TENSOR_ATTN_QUERY_WEIGHT, "decoder.blocks.%d.attn.query.weight"},
|
| 87 |
+
{ASR_TENSOR_ATTN_QUERY_BIAS, "decoder.blocks.%d.attn.query.bias"},
|
| 88 |
+
{ASR_TENSOR_ATTN_KEY_WEIGHT, "decoder.blocks.%d.attn.key.weight"},
|
| 89 |
+
{ASR_TENSOR_ATTN_VALUE_WEIGHT, "decoder.blocks.%d.attn.value.weight"},
|
| 90 |
+
{ASR_TENSOR_ATTN_VALUE_BIAS, "decoder.blocks.%d.attn.value.bias"},
|
| 91 |
+
{ASR_TENSOR_ATTN_OUT_WEIGHT, "decoder.blocks.%d.attn.out.weight"},
|
| 92 |
+
{ASR_TENSOR_ATTN_OUT_BIAS, "decoder.blocks.%d.attn.out.bias"},
|
| 93 |
+
},
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
ASR_SYSTEM_CROSS,
|
| 97 |
+
{
|
| 98 |
+
{ASR_TENSOR_ATTN_LN_WEIGHT, "decoder.blocks.%d.cross_attn_ln.weight"},
|
| 99 |
+
{ASR_TENSOR_ATTN_LN_BIAS, "decoder.blocks.%d.cross_attn_ln.bias"},
|
| 100 |
+
{ASR_TENSOR_ATTN_QUERY_WEIGHT, "decoder.blocks.%d.cross_attn.query.weight"},
|
| 101 |
+
{ASR_TENSOR_ATTN_QUERY_BIAS, "decoder.blocks.%d.cross_attn.query.bias"},
|
| 102 |
+
{ASR_TENSOR_ATTN_KEY_WEIGHT, "decoder.blocks.%d.cross_attn.key.weight"},
|
| 103 |
+
{ASR_TENSOR_ATTN_VALUE_WEIGHT, "decoder.blocks.%d.cross_attn.value.weight"},
|
| 104 |
+
{ASR_TENSOR_ATTN_VALUE_BIAS, "decoder.blocks.%d.cross_attn.value.bias"},
|
| 105 |
+
{ASR_TENSOR_ATTN_OUT_WEIGHT, "decoder.blocks.%d.cross_attn.out.weight"},
|
| 106 |
+
{ASR_TENSOR_ATTN_OUT_BIAS, "decoder.blocks.%d.cross_attn.out.bias"},
|
| 107 |
+
},
|
| 108 |
+
},
|
| 109 |
+
};
|
| 110 |
+
|
| 111 |
+
static const std::map<asr_tensor, ggml_op> ASR_TENSOR_INFO = {
|
| 112 |
+
{ASR_TENSOR_ENC_POS_EMBD, GGML_OP_ADD},
|
| 113 |
+
{ASR_TENSOR_DEC_POS_EMBD, GGML_OP_GET_ROWS},
|
| 114 |
+
// Note: ASR_TENSOR_DEC_TOKEN_EMBD_WEIGHT is also used by GGML_OP_MUL_MAT. Need to figure out a way to handle
|
| 115 |
+
// weight tensors that are used by multiple different operators when extra_buffer_type implementations accelerate
|
| 116 |
+
// more than just GGML_OP_MUL_MAT.
|
| 117 |
+
{ASR_TENSOR_DEC_TOKEN_EMBD_WEIGHT, GGML_OP_GET_ROWS},
|
| 118 |
+
{ASR_TENSOR_LN_WEIGHT, GGML_OP_MUL},
|
| 119 |
+
{ASR_TENSOR_LN_BIAS, GGML_OP_ADD},
|
| 120 |
+
{ASR_TENSOR_CONV1_WEIGHT, GGML_OP_IM2COL},
|
| 121 |
+
{ASR_TENSOR_CONV1_BIAS, GGML_OP_ADD},
|
| 122 |
+
{ASR_TENSOR_CONV2_WEIGHT, GGML_OP_IM2COL},
|
| 123 |
+
{ASR_TENSOR_CONV2_BIAS, GGML_OP_ADD},
|
| 124 |
+
{ASR_TENSOR_LN_POST_WEIGHT, GGML_OP_MUL},
|
| 125 |
+
{ASR_TENSOR_LN_POST_BIAS, GGML_OP_ADD},
|
| 126 |
+
{ASR_TENSOR_MLP_LN_WEIGHT, GGML_OP_MUL},
|
| 127 |
+
{ASR_TENSOR_MLP_LN_BIAS, GGML_OP_ADD},
|
| 128 |
+
{ASR_TENSOR_MLP_0_WEIGHT, GGML_OP_MUL_MAT},
|
| 129 |
+
{ASR_TENSOR_MLP_0_BIAS, GGML_OP_ADD},
|
| 130 |
+
{ASR_TENSOR_MLP_2_WEIGHT, GGML_OP_MUL_MAT},
|
| 131 |
+
{ASR_TENSOR_MLP_2_BIAS, GGML_OP_ADD},
|
| 132 |
+
{ASR_TENSOR_ATTN_LN_WEIGHT, GGML_OP_MUL},
|
| 133 |
+
{ASR_TENSOR_ATTN_LN_BIAS, GGML_OP_ADD},
|
| 134 |
+
{ASR_TENSOR_ATTN_QUERY_WEIGHT, GGML_OP_MUL_MAT},
|
| 135 |
+
{ASR_TENSOR_ATTN_QUERY_BIAS, GGML_OP_ADD},
|
| 136 |
+
{ASR_TENSOR_ATTN_KEY_WEIGHT, GGML_OP_MUL_MAT},
|
| 137 |
+
{ASR_TENSOR_ATTN_VALUE_WEIGHT, GGML_OP_MUL_MAT},
|
| 138 |
+
{ASR_TENSOR_ATTN_VALUE_BIAS, GGML_OP_ADD},
|
| 139 |
+
{ASR_TENSOR_ATTN_OUT_WEIGHT, GGML_OP_MUL_MAT},
|
| 140 |
+
{ASR_TENSOR_ATTN_OUT_BIAS, GGML_OP_ADD},
|
| 141 |
+
};
|
src/whisper.cpp
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
#include "whisper.h"
|
|
|
|
| 2 |
|
| 3 |
#include "ggml.h"
|
| 4 |
#include "ggml-cpp.h"
|
|
@@ -18,6 +19,7 @@
|
|
| 18 |
#include <cassert>
|
| 19 |
#define _USE_MATH_DEFINES
|
| 20 |
#include <cmath>
|
|
|
|
| 21 |
#include <codecvt>
|
| 22 |
#include <cstdarg>
|
| 23 |
#include <cstdio>
|
|
@@ -143,6 +145,21 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text
|
|
| 143 |
#define WHISPER_MAX_DECODERS 8
|
| 144 |
#define WHISPER_MAX_NODES 4096
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
//
|
| 147 |
// ggml helpers
|
| 148 |
//
|
|
@@ -778,10 +795,10 @@ struct whisper_model {
|
|
| 778 |
std::vector<whisper_layer_decoder> layers_decoder;
|
| 779 |
|
| 780 |
// ggml context that contains all the meta information about the model tensors
|
| 781 |
-
|
| 782 |
|
| 783 |
// the model backend data is read-only and can be shared between processors
|
| 784 |
-
ggml_backend_buffer_t
|
| 785 |
|
| 786 |
// tensors
|
| 787 |
int n_loaded;
|
|
@@ -1364,28 +1381,109 @@ static std::vector<ggml_backend_t> whisper_backend_init(const whisper_context_pa
|
|
| 1364 |
return result;
|
| 1365 |
}
|
| 1366 |
|
| 1367 |
-
|
| 1368 |
-
ggml_backend_buffer_type_t result = ggml_backend_cpu_buffer_type();
|
| 1369 |
|
| 1370 |
-
|
| 1371 |
-
|
| 1372 |
-
|
| 1373 |
|
| 1374 |
-
|
| 1375 |
-
|
| 1376 |
-
|
| 1377 |
-
|
| 1378 |
-
|
| 1379 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1380 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1381 |
|
| 1382 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1383 |
break;
|
| 1384 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1385 |
}
|
| 1386 |
}
|
| 1387 |
|
| 1388 |
-
return
|
| 1389 |
}
|
| 1390 |
|
| 1391 |
// load the model from a ggml file
|
|
@@ -1594,31 +1692,65 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 1594 |
const ggml_type wtype = wctx.wtype;
|
| 1595 |
const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
|
| 1596 |
|
| 1597 |
-
|
| 1598 |
-
{
|
| 1599 |
-
const auto & hparams = model.hparams;
|
| 1600 |
|
| 1601 |
-
|
| 1602 |
-
|
| 1603 |
|
| 1604 |
-
|
| 1605 |
|
| 1606 |
-
|
| 1607 |
-
|
| 1608 |
-
|
| 1609 |
-
|
| 1610 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1611 |
|
| 1612 |
-
|
| 1613 |
-
|
| 1614 |
-
|
| 1615 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1616 |
}
|
| 1617 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1618 |
|
| 1619 |
// prepare tensors for the weights
|
| 1620 |
{
|
| 1621 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1622 |
|
| 1623 |
const auto & hparams = model.hparams;
|
| 1624 |
|
|
@@ -1638,189 +1770,108 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 1638 |
model.layers_decoder.resize(n_text_layer);
|
| 1639 |
|
| 1640 |
// encoder
|
| 1641 |
-
|
| 1642 |
-
model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
|
| 1643 |
-
|
| 1644 |
-
model.e_conv_1_w = ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state);
|
| 1645 |
-
model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
|
| 1646 |
-
|
| 1647 |
-
model.e_conv_2_w = ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state);
|
| 1648 |
-
model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
|
| 1649 |
-
|
| 1650 |
-
model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
|
| 1651 |
-
model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
|
| 1652 |
-
|
| 1653 |
-
// map by name
|
| 1654 |
-
model.tensors["encoder.positional_embedding"] = model.e_pe;
|
| 1655 |
-
|
| 1656 |
-
model.tensors["encoder.conv1.weight"] = model.e_conv_1_w;
|
| 1657 |
-
model.tensors["encoder.conv1.bias"] = model.e_conv_1_b;
|
| 1658 |
-
|
| 1659 |
-
model.tensors["encoder.conv2.weight"] = model.e_conv_2_w;
|
| 1660 |
-
model.tensors["encoder.conv2.bias"] = model.e_conv_2_b;
|
| 1661 |
|
| 1662 |
-
|
| 1663 |
-
|
| 1664 |
|
| 1665 |
-
|
| 1666 |
-
|
| 1667 |
|
| 1668 |
-
|
| 1669 |
-
|
| 1670 |
|
| 1671 |
-
|
| 1672 |
-
|
| 1673 |
|
| 1674 |
-
|
| 1675 |
-
|
| 1676 |
|
| 1677 |
-
|
| 1678 |
-
|
| 1679 |
|
| 1680 |
-
|
| 1681 |
-
|
| 1682 |
|
| 1683 |
-
|
|
|
|
| 1684 |
|
| 1685 |
-
|
| 1686 |
-
|
| 1687 |
|
| 1688 |
-
|
| 1689 |
-
layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
|
| 1690 |
|
| 1691 |
-
|
| 1692 |
-
|
| 1693 |
-
model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b;
|
| 1694 |
|
| 1695 |
-
|
| 1696 |
-
|
| 1697 |
-
|
| 1698 |
-
model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w;
|
| 1699 |
-
model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b;
|
| 1700 |
-
|
| 1701 |
-
model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w;
|
| 1702 |
-
model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b;
|
| 1703 |
-
|
| 1704 |
-
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w;
|
| 1705 |
-
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b;
|
| 1706 |
-
|
| 1707 |
-
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w;
|
| 1708 |
-
|
| 1709 |
-
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w;
|
| 1710 |
-
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b;
|
| 1711 |
-
|
| 1712 |
-
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w;
|
| 1713 |
-
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b;
|
| 1714 |
-
}
|
| 1715 |
}
|
| 1716 |
|
| 1717 |
// decoder
|
| 1718 |
-
|
| 1719 |
-
model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx);
|
| 1720 |
-
|
| 1721 |
-
model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab);
|
| 1722 |
-
|
| 1723 |
-
model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1724 |
-
model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1725 |
-
|
| 1726 |
-
// map by name
|
| 1727 |
-
model.tensors["decoder.positional_embedding"] = model.d_pe;
|
| 1728 |
-
|
| 1729 |
-
model.tensors["decoder.token_embedding.weight"] = model.d_te;
|
| 1730 |
-
|
| 1731 |
-
model.tensors["decoder.ln.weight"] = model.d_ln_w;
|
| 1732 |
-
model.tensors["decoder.ln.bias"] = model.d_ln_b;
|
| 1733 |
-
|
| 1734 |
-
for (int i = 0; i < n_text_layer; ++i) {
|
| 1735 |
-
auto & layer = model.layers_decoder[i];
|
| 1736 |
-
|
| 1737 |
-
layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1738 |
-
layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1739 |
-
|
| 1740 |
-
layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state);
|
| 1741 |
-
layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state);
|
| 1742 |
-
|
| 1743 |
-
layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state);
|
| 1744 |
-
layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1745 |
-
|
| 1746 |
-
layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1747 |
-
layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1748 |
-
|
| 1749 |
-
layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
|
| 1750 |
-
layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1751 |
-
|
| 1752 |
-
layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
|
| 1753 |
-
|
| 1754 |
-
layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
|
| 1755 |
-
layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1756 |
|
| 1757 |
-
|
| 1758 |
-
layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1759 |
|
| 1760 |
-
|
| 1761 |
-
|
| 1762 |
|
| 1763 |
-
|
| 1764 |
-
|
| 1765 |
|
| 1766 |
-
|
|
|
|
| 1767 |
|
| 1768 |
-
|
| 1769 |
-
|
| 1770 |
|
| 1771 |
-
|
| 1772 |
-
|
| 1773 |
|
| 1774 |
-
|
| 1775 |
-
|
| 1776 |
-
model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b;
|
| 1777 |
|
| 1778 |
-
|
| 1779 |
-
|
| 1780 |
|
| 1781 |
-
|
| 1782 |
-
model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b;
|
| 1783 |
|
| 1784 |
-
|
| 1785 |
-
|
| 1786 |
|
| 1787 |
-
|
| 1788 |
-
|
| 1789 |
|
| 1790 |
-
|
|
|
|
| 1791 |
|
| 1792 |
-
|
| 1793 |
-
|
| 1794 |
|
| 1795 |
-
|
| 1796 |
-
model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b;
|
| 1797 |
|
| 1798 |
-
|
| 1799 |
-
|
| 1800 |
|
| 1801 |
-
|
| 1802 |
-
|
| 1803 |
-
|
| 1804 |
-
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.key.weight"] = layer.cross_attn_k_w;
|
| 1805 |
-
|
| 1806 |
-
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.weight"] = layer.cross_attn_v_w;
|
| 1807 |
-
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.bias"] = layer.cross_attn_v_b;
|
| 1808 |
-
|
| 1809 |
-
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.weight"] = layer.cross_attn_ln_1_w;
|
| 1810 |
-
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.bias"] = layer.cross_attn_ln_1_b;
|
| 1811 |
-
}
|
| 1812 |
}
|
|
|
|
|
|
|
| 1813 |
}
|
| 1814 |
|
| 1815 |
// allocate tensors in the backend buffers
|
| 1816 |
-
|
| 1817 |
-
|
| 1818 |
-
|
| 1819 |
-
|
| 1820 |
-
|
|
|
|
| 1821 |
|
| 1822 |
-
|
| 1823 |
-
|
|
|
|
|
|
|
| 1824 |
|
| 1825 |
// load weights
|
| 1826 |
{
|
|
@@ -1883,11 +1934,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 1883 |
return false;
|
| 1884 |
}
|
| 1885 |
|
| 1886 |
-
|
| 1887 |
-
|
| 1888 |
-
//printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());
|
| 1889 |
-
|
| 1890 |
-
if (ggml_backend_buffer_is_host(model.buffer)) {
|
| 1891 |
// for the CPU and Metal backend, we can read directly into the tensor
|
| 1892 |
loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
|
| 1893 |
BYTESWAP_TENSOR(tensor);
|
|
@@ -1900,7 +1947,6 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 1900 |
ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
|
| 1901 |
}
|
| 1902 |
|
| 1903 |
-
//printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1e6);
|
| 1904 |
total_size += ggml_nbytes(tensor);
|
| 1905 |
model.n_loaded++;
|
| 1906 |
}
|
|
@@ -1915,7 +1961,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 1915 |
}
|
| 1916 |
}
|
| 1917 |
|
| 1918 |
-
|
|
|
|
|
|
|
| 1919 |
|
| 1920 |
wctx.t_load_us = ggml_time_us() - t_start_us;
|
| 1921 |
|
|
@@ -3806,9 +3854,13 @@ void whisper_free_state(struct whisper_state * state) {
|
|
| 3806 |
|
| 3807 |
void whisper_free(struct whisper_context * ctx) {
|
| 3808 |
if (ctx) {
|
| 3809 |
-
|
|
|
|
|
|
|
| 3810 |
|
| 3811 |
-
|
|
|
|
|
|
|
| 3812 |
|
| 3813 |
whisper_free_state(ctx->state);
|
| 3814 |
|
|
|
|
| 1 |
#include "whisper.h"
|
| 2 |
+
#include "whisper-arch.h"
|
| 3 |
|
| 4 |
#include "ggml.h"
|
| 5 |
#include "ggml-cpp.h"
|
|
|
|
| 19 |
#include <cassert>
|
| 20 |
#define _USE_MATH_DEFINES
|
| 21 |
#include <cmath>
|
| 22 |
+
#include <climits>
|
| 23 |
#include <codecvt>
|
| 24 |
#include <cstdarg>
|
| 25 |
#include <cstdio>
|
|
|
|
| 145 |
#define WHISPER_MAX_DECODERS 8
|
| 146 |
#define WHISPER_MAX_NODES 4096
|
| 147 |
|
| 148 |
+
static std::string format(const char * fmt, ...) {
|
| 149 |
+
va_list ap;
|
| 150 |
+
va_list ap2;
|
| 151 |
+
va_start(ap, fmt);
|
| 152 |
+
va_copy(ap2, ap);
|
| 153 |
+
int size = vsnprintf(NULL, 0, fmt, ap);
|
| 154 |
+
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
|
| 155 |
+
std::vector<char> buf(size + 1);
|
| 156 |
+
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
| 157 |
+
GGML_ASSERT(size2 == size);
|
| 158 |
+
va_end(ap2);
|
| 159 |
+
va_end(ap);
|
| 160 |
+
return std::string(buf.data(), size);
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
//
|
| 164 |
// ggml helpers
|
| 165 |
//
|
|
|
|
| 795 |
std::vector<whisper_layer_decoder> layers_decoder;
|
| 796 |
|
| 797 |
// ggml context that contains all the meta information about the model tensors
|
| 798 |
+
std::vector<ggml_context *> ctxs;
|
| 799 |
|
| 800 |
// the model backend data is read-only and can be shared between processors
|
| 801 |
+
std::vector<ggml_backend_buffer_t> buffers;
|
| 802 |
|
| 803 |
// tensors
|
| 804 |
int n_loaded;
|
|
|
|
| 1381 |
return result;
|
| 1382 |
}
|
| 1383 |
|
| 1384 |
+
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
|
|
|
|
| 1385 |
|
| 1386 |
+
static buft_list_t make_buft_list(whisper_context_params & params) {
|
| 1387 |
+
// Prio order: GPU -> CPU Extra -> CPU
|
| 1388 |
+
buft_list_t buft_list;
|
| 1389 |
|
| 1390 |
+
// GPU
|
| 1391 |
+
if (params.use_gpu) {
|
| 1392 |
+
int cnt = 0;
|
| 1393 |
+
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
| 1394 |
+
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
| 1395 |
+
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
|
| 1396 |
+
if (cnt == 0 || cnt == params.gpu_device) {
|
| 1397 |
+
auto * buft = ggml_backend_dev_buffer_type(dev);
|
| 1398 |
+
if (buft) {
|
| 1399 |
+
buft_list.emplace_back(dev, buft);
|
| 1400 |
+
}
|
| 1401 |
+
}
|
| 1402 |
+
|
| 1403 |
+
if (++cnt > params.gpu_device) {
|
| 1404 |
+
break;
|
| 1405 |
+
}
|
| 1406 |
}
|
| 1407 |
+
}
|
| 1408 |
+
}
|
| 1409 |
+
|
| 1410 |
+
// CPU Extra
|
| 1411 |
+
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
| 1412 |
+
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
|
| 1413 |
+
auto get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
|
| 1414 |
+
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
|
| 1415 |
+
if (get_extra_bufts_fn) {
|
| 1416 |
+
ggml_backend_buffer_type_t * extra_bufts = get_extra_bufts_fn(cpu_dev);
|
| 1417 |
+
while (extra_bufts && *extra_bufts) {
|
| 1418 |
+
buft_list.emplace_back(cpu_dev, *extra_bufts);
|
| 1419 |
+
++extra_bufts;
|
| 1420 |
+
}
|
| 1421 |
+
}
|
| 1422 |
+
|
| 1423 |
+
// CPU
|
| 1424 |
+
buft_list.emplace_back(cpu_dev, ggml_backend_cpu_buffer_type());
|
| 1425 |
+
|
| 1426 |
+
return buft_list;
|
| 1427 |
+
}
|
| 1428 |
+
|
| 1429 |
+
static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
|
| 1430 |
+
bool op_supported = true;
|
| 1431 |
+
|
| 1432 |
+
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU ||
|
| 1433 |
+
(ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && buft == ggml_backend_cpu_buffer_type())) {
|
| 1434 |
+
// GPU and default CPU backend support all operators
|
| 1435 |
+
op_supported = true;
|
| 1436 |
+
} else {
|
| 1437 |
+
switch (op) {
|
| 1438 |
+
// The current extra_buffer_type implementations only support GGML_OP_MUL_MAT
|
| 1439 |
+
case GGML_OP_MUL_MAT: {
|
| 1440 |
+
ggml_init_params params = {
|
| 1441 |
+
/*.mem_size =*/ 2 * ggml_tensor_overhead(),
|
| 1442 |
+
/*.mem_buffer =*/ nullptr,
|
| 1443 |
+
/*.no_alloc =*/ true,
|
| 1444 |
+
};
|
| 1445 |
+
|
| 1446 |
+
ggml_context_ptr ctx_ptr { ggml_init(params) };
|
| 1447 |
+
if (!ctx_ptr) {
|
| 1448 |
+
throw std::runtime_error("failed to create ggml context");
|
| 1449 |
+
}
|
| 1450 |
+
ggml_context * ctx = ctx_ptr.get();
|
| 1451 |
|
| 1452 |
+
ggml_tensor * op_tensor = nullptr;
|
| 1453 |
+
|
| 1454 |
+
int64_t n_ctx = hparams.n_audio_ctx;
|
| 1455 |
+
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
|
| 1456 |
+
op_tensor = ggml_mul_mat(ctx, w, b);
|
| 1457 |
+
|
| 1458 |
+
// create a temporary dummy buffer for the weight so that supports_op can check the buffer type
|
| 1459 |
+
GGML_ASSERT(w->buffer == nullptr);
|
| 1460 |
+
w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
|
| 1461 |
+
op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
|
| 1462 |
+
ggml_backend_buffer_free(w->buffer);
|
| 1463 |
+
w->buffer = nullptr;
|
| 1464 |
+
break;
|
| 1465 |
+
}
|
| 1466 |
+
default: {
|
| 1467 |
+
op_supported = false;
|
| 1468 |
break;
|
| 1469 |
}
|
| 1470 |
+
};
|
| 1471 |
+
}
|
| 1472 |
+
|
| 1473 |
+
return op_supported;
|
| 1474 |
+
}
|
| 1475 |
+
|
| 1476 |
+
static ggml_backend_buffer_type_t select_weight_buft(const whisper_hparams & hparams, ggml_tensor * w, ggml_op op, buft_list_t buft_list) {
|
| 1477 |
+
GGML_ASSERT(!buft_list.empty());
|
| 1478 |
+
for (const auto & p : buft_list) {
|
| 1479 |
+
ggml_backend_dev_t dev = p.first;
|
| 1480 |
+
ggml_backend_buffer_type_t buft = p.second;
|
| 1481 |
+
if (weight_buft_supported(hparams, w, op, buft, dev)) {
|
| 1482 |
+
return buft;
|
| 1483 |
}
|
| 1484 |
}
|
| 1485 |
|
| 1486 |
+
return nullptr;
|
| 1487 |
}
|
| 1488 |
|
| 1489 |
// load the model from a ggml file
|
|
|
|
| 1692 |
const ggml_type wtype = wctx.wtype;
|
| 1693 |
const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
|
| 1694 |
|
| 1695 |
+
const auto & hparams = model.hparams;
|
|
|
|
|
|
|
| 1696 |
|
| 1697 |
+
const int n_audio_layer = hparams.n_audio_layer;
|
| 1698 |
+
const int n_text_layer = hparams.n_text_layer;
|
| 1699 |
|
| 1700 |
+
const size_t n_tensors = 10 /* input */ + 15 + 15*n_audio_layer + 24*n_text_layer;
|
| 1701 |
|
| 1702 |
+
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
| 1703 |
+
auto get_ctx = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
|
| 1704 |
+
auto it = ctx_map.find(buft);
|
| 1705 |
+
if (it == ctx_map.end()) {
|
| 1706 |
+
ggml_init_params params = {
|
| 1707 |
+
/*.mem_size =*/ n_tensors * ggml_tensor_overhead(),
|
| 1708 |
+
/*.mem_buffer =*/ nullptr,
|
| 1709 |
+
/*.no_alloc =*/ true,
|
| 1710 |
+
};
|
| 1711 |
|
| 1712 |
+
ggml_context * ctx = ggml_init(params);
|
| 1713 |
+
if (!ctx) {
|
| 1714 |
+
throw std::runtime_error("failed to create ggml context");
|
| 1715 |
+
}
|
| 1716 |
+
|
| 1717 |
+
ctx_map[buft] = ctx;
|
| 1718 |
+
model.ctxs.emplace_back(ctx);
|
| 1719 |
+
|
| 1720 |
+
return ctx;
|
| 1721 |
}
|
| 1722 |
+
|
| 1723 |
+
return it->second;
|
| 1724 |
+
};
|
| 1725 |
+
|
| 1726 |
+
// Create a list of available bufts, in priority order
|
| 1727 |
+
buft_list_t buft_list = make_buft_list(wctx.params);
|
| 1728 |
+
|
| 1729 |
+
auto create_tensor = [&](asr_tensor type, asr_system system, ggml_tensor * meta, int layer = 0) -> ggml_tensor * {
|
| 1730 |
+
ggml_op op = ASR_TENSOR_INFO.at(type);
|
| 1731 |
+
ggml_backend_buffer_type_t buft = select_weight_buft(hparams, meta, op, buft_list);
|
| 1732 |
+
if (!buft) {
|
| 1733 |
+
throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", ASR_TENSOR_NAMES.at(system).at(type)));
|
| 1734 |
+
}
|
| 1735 |
+
|
| 1736 |
+
ggml_context * ctx = get_ctx(buft);
|
| 1737 |
+
ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
|
| 1738 |
+
|
| 1739 |
+
model.tensors[format(ASR_TENSOR_NAMES.at(system).at(type), layer)] = tensor;
|
| 1740 |
+
|
| 1741 |
+
return tensor;
|
| 1742 |
+
};
|
| 1743 |
+
|
| 1744 |
|
| 1745 |
// prepare tensors for the weights
|
| 1746 |
{
|
| 1747 |
+
ggml_init_params params = {
|
| 1748 |
+
/*.mem_size =*/ n_tensors * ggml_tensor_overhead(),
|
| 1749 |
+
/*.mem_buffer =*/ nullptr,
|
| 1750 |
+
/*.no_alloc =*/ true,
|
| 1751 |
+
};
|
| 1752 |
+
|
| 1753 |
+
ggml_context * ctx = ggml_init(params);
|
| 1754 |
|
| 1755 |
const auto & hparams = model.hparams;
|
| 1756 |
|
|
|
|
| 1770 |
model.layers_decoder.resize(n_text_layer);
|
| 1771 |
|
| 1772 |
// encoder
|
| 1773 |
+
model.e_pe = create_tensor(ASR_TENSOR_ENC_POS_EMBD, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1774 |
|
| 1775 |
+
model.e_conv_1_w = create_tensor(ASR_TENSOR_CONV1_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state));
|
| 1776 |
+
model.e_conv_1_b = create_tensor(ASR_TENSOR_CONV1_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state));
|
| 1777 |
|
| 1778 |
+
model.e_conv_2_w = create_tensor(ASR_TENSOR_CONV2_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state));
|
| 1779 |
+
model.e_conv_2_b = create_tensor(ASR_TENSOR_CONV2_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state));
|
| 1780 |
|
| 1781 |
+
model.e_ln_w = create_tensor(ASR_TENSOR_LN_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state));
|
| 1782 |
+
model.e_ln_b = create_tensor(ASR_TENSOR_LN_POST_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state));
|
| 1783 |
|
| 1784 |
+
for (int i = 0; i < n_audio_layer; ++i) {
|
| 1785 |
+
auto & layer = model.layers_encoder[i];
|
| 1786 |
|
| 1787 |
+
layer.mlp_ln_w = create_tensor(ASR_TENSOR_MLP_LN_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state), i);
|
| 1788 |
+
layer.mlp_ln_b = create_tensor(ASR_TENSOR_MLP_LN_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state), i);
|
| 1789 |
|
| 1790 |
+
layer.mlp_0_w = create_tensor(ASR_TENSOR_MLP_0_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state), i);
|
| 1791 |
+
layer.mlp_0_b = create_tensor(ASR_TENSOR_MLP_0_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state), i);
|
| 1792 |
|
| 1793 |
+
layer.mlp_1_w = create_tensor(ASR_TENSOR_MLP_2_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state), i);
|
| 1794 |
+
layer.mlp_1_b = create_tensor(ASR_TENSOR_MLP_2_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state), i);
|
| 1795 |
|
| 1796 |
+
layer.attn_ln_0_w = create_tensor(ASR_TENSOR_ATTN_LN_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state), i);
|
| 1797 |
+
layer.attn_ln_0_b = create_tensor(ASR_TENSOR_ATTN_LN_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state), i);
|
| 1798 |
|
| 1799 |
+
layer.attn_q_w = create_tensor(ASR_TENSOR_ATTN_QUERY_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state), i);
|
| 1800 |
+
layer.attn_q_b = create_tensor(ASR_TENSOR_ATTN_QUERY_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state), i);
|
| 1801 |
|
| 1802 |
+
layer.attn_k_w = create_tensor(ASR_TENSOR_ATTN_KEY_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state), i);
|
|
|
|
| 1803 |
|
| 1804 |
+
layer.attn_v_w = create_tensor(ASR_TENSOR_ATTN_VALUE_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state), i);
|
| 1805 |
+
layer.attn_v_b = create_tensor(ASR_TENSOR_ATTN_VALUE_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state), i);
|
|
|
|
| 1806 |
|
| 1807 |
+
layer.attn_ln_1_w = create_tensor(ASR_TENSOR_ATTN_OUT_WEIGHT, ASR_SYSTEM_ENCODER, ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state), i);
|
| 1808 |
+
layer.attn_ln_1_b = create_tensor(ASR_TENSOR_ATTN_OUT_BIAS, ASR_SYSTEM_ENCODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state), i);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1809 |
}
|
| 1810 |
|
| 1811 |
// decoder
|
| 1812 |
+
model.d_pe = create_tensor(ASR_TENSOR_DEC_POS_EMBD, ASR_SYSTEM_DECODER, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1813 |
|
| 1814 |
+
model.d_te = create_tensor(ASR_TENSOR_DEC_TOKEN_EMBD_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab));
|
|
|
|
| 1815 |
|
| 1816 |
+
model.d_ln_w = create_tensor(ASR_TENSOR_LN_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state));
|
| 1817 |
+
model.d_ln_b = create_tensor(ASR_TENSOR_LN_BIAS, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state));
|
| 1818 |
|
| 1819 |
+
for (int i = 0; i < n_text_layer; ++i) {
|
| 1820 |
+
auto & layer = model.layers_decoder[i];
|
| 1821 |
|
| 1822 |
+
layer.mlp_ln_w = create_tensor(ASR_TENSOR_MLP_LN_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
|
| 1823 |
+
layer.mlp_ln_b = create_tensor(ASR_TENSOR_MLP_LN_BIAS, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
|
| 1824 |
|
| 1825 |
+
layer.mlp_0_w = create_tensor(ASR_TENSOR_MLP_0_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state), i);
|
| 1826 |
+
layer.mlp_0_b = create_tensor(ASR_TENSOR_MLP_0_BIAS, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state), i);
|
| 1827 |
|
| 1828 |
+
layer.mlp_1_w = create_tensor(ASR_TENSOR_MLP_2_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state), i);
|
| 1829 |
+
layer.mlp_1_b = create_tensor(ASR_TENSOR_MLP_2_BIAS, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
|
| 1830 |
|
| 1831 |
+
layer.attn_ln_0_w = create_tensor(ASR_TENSOR_ATTN_LN_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
|
| 1832 |
+
layer.attn_ln_0_b = create_tensor(ASR_TENSOR_ATTN_LN_BIAS, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
|
|
|
|
| 1833 |
|
| 1834 |
+
layer.attn_q_w = create_tensor(ASR_TENSOR_ATTN_QUERY_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state), i);
|
| 1835 |
+
layer.attn_q_b = create_tensor(ASR_TENSOR_ATTN_QUERY_BIAS, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
|
| 1836 |
|
| 1837 |
+
layer.attn_k_w = create_tensor(ASR_TENSOR_ATTN_KEY_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state), i);
|
|
|
|
| 1838 |
|
| 1839 |
+
layer.attn_v_w = create_tensor(ASR_TENSOR_ATTN_VALUE_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state), i);
|
| 1840 |
+
layer.attn_v_b = create_tensor(ASR_TENSOR_ATTN_VALUE_BIAS, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
|
| 1841 |
|
| 1842 |
+
layer.attn_ln_1_w = create_tensor(ASR_TENSOR_ATTN_OUT_WEIGHT, ASR_SYSTEM_DECODER, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state), i);
|
| 1843 |
+
layer.attn_ln_1_b = create_tensor(ASR_TENSOR_ATTN_OUT_BIAS, ASR_SYSTEM_DECODER, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
|
| 1844 |
|
| 1845 |
+
layer.cross_attn_ln_0_w = create_tensor(ASR_TENSOR_ATTN_LN_WEIGHT, ASR_SYSTEM_CROSS, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
|
| 1846 |
+
layer.cross_attn_ln_0_b = create_tensor(ASR_TENSOR_ATTN_LN_BIAS, ASR_SYSTEM_CROSS, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
|
| 1847 |
|
| 1848 |
+
layer.cross_attn_q_w = create_tensor(ASR_TENSOR_ATTN_QUERY_WEIGHT, ASR_SYSTEM_CROSS, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state), i);
|
| 1849 |
+
layer.cross_attn_q_b = create_tensor(ASR_TENSOR_ATTN_QUERY_BIAS, ASR_SYSTEM_CROSS, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
|
| 1850 |
|
| 1851 |
+
layer.cross_attn_k_w = create_tensor(ASR_TENSOR_ATTN_KEY_WEIGHT, ASR_SYSTEM_CROSS, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state), i);
|
|
|
|
| 1852 |
|
| 1853 |
+
layer.cross_attn_v_w = create_tensor(ASR_TENSOR_ATTN_VALUE_WEIGHT, ASR_SYSTEM_CROSS, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state), i);
|
| 1854 |
+
layer.cross_attn_v_b = create_tensor(ASR_TENSOR_ATTN_VALUE_BIAS, ASR_SYSTEM_CROSS, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
|
| 1855 |
|
| 1856 |
+
layer.cross_attn_ln_1_w = create_tensor(ASR_TENSOR_ATTN_OUT_WEIGHT, ASR_SYSTEM_CROSS, ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state), i);
|
| 1857 |
+
layer.cross_attn_ln_1_b = create_tensor(ASR_TENSOR_ATTN_OUT_BIAS, ASR_SYSTEM_CROSS, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state), i);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1858 |
}
|
| 1859 |
+
|
| 1860 |
+
ggml_free(ctx);
|
| 1861 |
}
|
| 1862 |
|
| 1863 |
// allocate tensors in the backend buffers
|
| 1864 |
+
for (auto & p : ctx_map) {
|
| 1865 |
+
ggml_backend_buffer_type_t buft = p.first;
|
| 1866 |
+
ggml_context * ctx = p.second;
|
| 1867 |
+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
| 1868 |
+
if (buf) {
|
| 1869 |
+
model.buffers.emplace_back(buf);
|
| 1870 |
|
| 1871 |
+
size_t size_main = ggml_backend_buffer_get_size(buf);
|
| 1872 |
+
WHISPER_LOG_INFO("%s: %12s total size = %8.2f MB\n", __func__, ggml_backend_buffer_name(buf), size_main / 1e6);
|
| 1873 |
+
}
|
| 1874 |
+
}
|
| 1875 |
|
| 1876 |
// load weights
|
| 1877 |
{
|
|
|
|
| 1934 |
return false;
|
| 1935 |
}
|
| 1936 |
|
| 1937 |
+
if (ggml_backend_buffer_is_host(tensor->buffer)) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1938 |
// for host-accessible buffers (e.g. the CPU and Metal backends), we can read directly into the tensor
|
| 1939 |
loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
|
| 1940 |
BYTESWAP_TENSOR(tensor);
|
|
|
|
| 1947 |
ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
|
| 1948 |
}
|
| 1949 |
|
|
|
|
| 1950 |
total_size += ggml_nbytes(tensor);
|
| 1951 |
model.n_loaded++;
|
| 1952 |
}
|
|
|
|
| 1961 |
}
|
| 1962 |
}
|
| 1963 |
|
| 1964 |
+
for (auto & buf : model.buffers) {
|
| 1965 |
+
ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
| 1966 |
+
}
|
| 1967 |
|
| 1968 |
wctx.t_load_us = ggml_time_us() - t_start_us;
|
| 1969 |
|
|
|
|
| 3854 |
|
| 3855 |
void whisper_free(struct whisper_context * ctx) {
|
| 3856 |
if (ctx) {
|
| 3857 |
+
for (ggml_context * context : ctx->model.ctxs) {
|
| 3858 |
+
ggml_free(context);
|
| 3859 |
+
}
|
| 3860 |
|
| 3861 |
+
for (ggml_backend_buffer_t buf : ctx->model.buffers) {
|
| 3862 |
+
ggml_backend_buffer_free(buf);
|
| 3863 |
+
}
|
| 3864 |
|
| 3865 |
whisper_free_state(ctx->state);
|
| 3866 |
|