Commit 0025eec

feat: sync llama.cpp (#54)
1 parent e6039bd commit 0025eec

23 files changed (+2608 -982 lines)

cpp/common.cpp

+208-169
Large diffs are not rendered by default.

cpp/common.h

+13-4
@@ -31,6 +31,8 @@
     fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
 } while(0)
 
+#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const *LLAMA_COMMIT;
@@ -103,7 +105,7 @@ struct gpt_params {
     // // sampling parameters
     struct llama_sampling_params sparams;
 
-    std::string model = "models/7B/ggml-model-f16.gguf"; // model path
+    std::string model = ""; // model path
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string model_url = ""; // model url to download
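
The default model path moves out of the struct initializer: `model` now starts empty and `DEFAULT_MODEL_PATH` is defined once at the top of the header. A minimal sketch of how the default is presumably filled in after argument parsing, assuming `gpt_params_handle_model_default()` (declared further down in this diff, implemented in common.cpp, which is not rendered here) behaves roughly like this:

    // Sketch only: mirrors what gpt_params_handle_model_default() presumably does.
    // The real implementation is in common.cpp (not rendered in this commit view).
    #include "common.h"

    static void resolve_model_default(gpt_params & params) {
        // If no local path and no URL were supplied, fall back to the header default.
        if (params.model.empty() && params.model_url.empty()) {
            params.model = DEFAULT_MODEL_PATH; // "models/7B/ggml-model-f16.gguf"
        }
    }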
@@ -144,7 +146,7 @@ struct gpt_params {
     bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
     size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
 
-    bool kl_divergence = false; // compute KL-divergence
+    bool kl_divergence = false; // compute KL divergence
 
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
@@ -159,6 +161,7 @@ struct gpt_params {
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
+    bool flash_attn = false; // flash attention
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos = false; // ignore generated EOS tokens
@@ -172,15 +175,20 @@ struct gpt_params {
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
+    bool check_tensors = false; // validate tensor data
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
 
     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector
-    std::string image = ""; // path to an image file
+    std::string mmproj = "";        // path to multimodal projector
+    std::vector<std::string> image; // path to image file(s)
 };
 
+void gpt_params_handle_model_default(gpt_params & params);
+
+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
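
Two helpers move into the public header: `gpt_params_handle_model_default()` and `parse_kv_override()`. A rough usage sketch of the latter, assuming the `KEY=TYPE:VALUE` syntax used by llama.cpp's `--override-kv` option (the parser itself lives in common.cpp and is not shown in this diff):

    #include "common.h"
    #include <cstdio>
    #include <vector>

    static void collect_overrides_example() {
        // Collected overrides would later be handed to the model loader.
        std::vector<llama_model_kv_override> kv_overrides;
        // Hypothetical override string; the KEY=TYPE:VALUE form is an assumption here.
        if (!parse_kv_override("tokenizer.ggml.add_bos_token=bool:false", kv_overrides)) {
            std::fprintf(stderr, "invalid --override-kv argument\n");
        }
    }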
@@ -204,6 +212,7 @@ bool validate_file_name(const std::string & filename);
 std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
 std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
 std::vector<std::string> string_split(std::string input, char separator);
+std::string string_strip(const std::string & str);
 std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
 
 //

cpp/ggml-backend.c

+7-5
@@ -1784,12 +1784,14 @@ void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) {
 
 void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) {
     // reset state for the next run
-    size_t hash_size = sched->hash_set.size;
-    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
-    memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
-    memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
+    if (!sched->is_reset) {
+        size_t hash_size = sched->hash_set.size;
+        memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+        memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+        memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
 
-    sched->is_reset = true;
+        sched->is_reset = true;
+    }
     sched->is_alloc = false;
 }
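
`lm_ggml_backend_sched_reset()` now skips the `memset` passes when the scheduler is already in the reset state, so back-to-back resets are cheap. An illustrative caller, assuming the usual reset/compute cycle and that `lm_ggml_backend_sched_graph_compute()` from ggml-backend.h is the companion call (names follow this repo's `lm_` prefix):

    // Illustrative decode loop: with the guard above, calling reset on an
    // already-reset scheduler no longer re-clears the hash set and tensor tables.
    static void run_graphs(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * gf, int n_steps) {
        for (int i = 0; i < n_steps; ++i) {
            lm_ggml_backend_sched_reset(sched);              // skips the clears if nothing ran since the last reset
            lm_ggml_backend_sched_graph_compute(sched, gf);  // splits, allocates and runs the graph
        }
    }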
17951797

cpp/ggml-impl.h

+7-1
@@ -11,6 +11,12 @@
 #include <string.h> // memcpy
 #include <math.h> // fabsf
 
+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -307,7 +313,7 @@ inline static int32x4_t lm_ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t
 
 #endif // defined(__ARM_NEON)
 
-#if defined(__ARM_NEON) && !defined(__MSC_VER)
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
 
 #define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
 #define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
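
Two independent fixes here: `MIN`/`MAX` are now defined centrally in ggml-impl.h (after `#undef`, so earlier per-file definitions cannot conflict), and the NEON FP16 path tests the real MSVC macro `_MSC_VER` rather than the misspelled `__MSC_VER`. A standalone illustration of why the macro arguments are fully parenthesized; the values are made up for the example:

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))
    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    int main(void) {
        int x = 12;
        // The parentheses keep precedence intact when arguments are expressions:
        // an unparenthesized expansion of MIN(x & 7, 4) would evaluate its
        // condition as x & (7 < 4), because < binds tighter than &.
        printf("%d\n", MIN(x & 7, 4));  // prints 4
        printf("%d\n", MAX(x - 10, 0)); // prints 2
        return 0;
    }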
