Skip to content

Commit dc6a47a

Browse files
authored
feat: sync llama.cpp (#58)
* feat: sync llama.cpp * chore(example, ios): update lockfile
1 parent 7fbcebc commit dc6a47a

30 files changed

+17747
-8729
lines changed

cpp/common.cpp

+752-751
Large diffs are not rendered by default.

cpp/common.h

+51-42
Original file line numberDiff line numberDiff line change
@@ -27,23 +27,20 @@
2727
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
2828

2929
#define print_build_info() do { \
30-
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
30+
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
3131
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
3232
} while(0)
3333

3434
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
3535

3636
// build info
3737
extern int LLAMA_BUILD_NUMBER;
38-
extern char const *LLAMA_COMMIT;
39-
extern char const *LLAMA_COMPILER;
40-
extern char const *LLAMA_BUILD_TARGET;
38+
extern char const * LLAMA_COMMIT;
39+
extern char const * LLAMA_COMPILER;
40+
extern char const * LLAMA_BUILD_TARGET;
4141

4242
struct llama_control_vector_load_info;
4343

44-
int get_math_cpu_count();
45-
int32_t get_num_physical_cores();
46-
4744
#define print_build_info() do { \
4845
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
4946
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
@@ -55,14 +52,21 @@ extern char const *LLAMA_COMMIT;
5552
extern char const *LLAMA_COMPILER;
5653
extern char const *LLAMA_BUILD_TARGET;
5754

55+
//
56+
// CPU utils
57+
//
58+
59+
int32_t cpu_get_num_physical_cores();
60+
int32_t cpu_get_num_math();
61+
5862
//
5963
// CLI argument parsing
6064
//
6165

6266
struct gpt_params {
6367
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
6468

65-
int32_t n_threads = get_math_cpu_count();
69+
int32_t n_threads = cpu_get_num_math();
6670
int32_t n_threads_draft = -1;
6771
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
6872
int32_t n_threads_batch_draft = -1;
@@ -93,6 +97,7 @@ struct gpt_params {
9397
float yarn_beta_slow = 1.0f; // YaRN high correction dim
9498
int32_t yarn_orig_ctx = 0; // YaRN original context length
9599
float defrag_thold = -1.0f; // KV cache defragmentation threshold
100+
std::string rpc_servers = ""; // comma separated list of RPC servers
96101

97102
lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
98103
void * cb_eval_user_data = nullptr;
@@ -151,6 +156,9 @@ struct gpt_params {
151156
bool random_prompt = false; // do not randomize prompt if none provided
152157
bool use_color = false; // use color to distinguish generations and inputs
153158
bool interactive = false; // interactive mode
159+
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
160+
bool special = false; // enable special token output
161+
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
154162
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
155163
bool prompt_cache_all = false; // save user input and generations to prompt cache
156164
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
@@ -187,33 +195,34 @@ struct gpt_params {
187195

188196
void gpt_params_handle_model_default(gpt_params & params);
189197

190-
bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
191-
192-
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
198+
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
199+
bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
200+
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
201+
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
193202

194-
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
203+
std::string gpt_params_get_system_info(const gpt_params & params);
195204

196-
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
197-
198-
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
199-
200-
std::string get_system_info(const gpt_params & params);
205+
//
206+
// String utils
207+
//
201208

202-
std::string gpt_random_prompt(std::mt19937 & rng);
209+
std::vector<std::string> string_split(std::string input, char separator);
203210

204-
void process_escapes(std::string& input);
211+
std::string string_strip(const std::string & str);
212+
std::string string_get_sortable_timestamp();
213+
std::string string_random_prompt(std::mt19937 & rng);
205214

206-
bool validate_file_name(const std::string & filename);
215+
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
216+
void string_process_escapes(std::string & input);
207217

208218
//
209-
// String utils
219+
// Filesystem utils
210220
//
211221

212-
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
213-
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
214-
std::vector<std::string> string_split(std::string input, char separator);
215-
std::string string_strip(const std::string & str);
216-
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
222+
bool fs_validate_filename(const std::string & filename);
223+
bool fs_create_directory_with_parents(const std::string & path);
224+
225+
std::string fs_get_cache_directory();
217226

218227
//
219228
// Model utils
@@ -284,29 +293,15 @@ std::string llama_detokenize_bpe(
284293
// defaults to true when model type is SPM, otherwise false.
285294
bool llama_should_add_bos_token(const llama_model * model);
286295

287-
//
288-
// YAML utils
289-
//
290-
291-
bool create_directory_with_parents(const std::string & path);
292-
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
293-
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
294-
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
295-
std::string get_sortable_timestamp();
296-
297-
void dump_non_result_info_yaml(
298-
FILE * stream, const gpt_params & params, const llama_context * lctx,
299-
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
300-
301296
//
302297
// KV cache utils
303298
//
304299

305300
// Dump the KV cache view with the number of sequences per cell.
306-
void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
301+
void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
307302

308303
// Dump the KV cache view showing individual sequences in each cell (long output).
309-
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
304+
void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
310305

311306
//
312307
// Embedding utils
@@ -340,6 +335,20 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
340335
//
341336
// Split utils
342337
//
338+
343339
static const char * const LLM_KV_SPLIT_NO = "split.no";
344340
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
345341
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
342+
343+
//
344+
// YAML utils
345+
//
346+
347+
void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
348+
void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
349+
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
350+
351+
void yaml_dump_non_result_info(
352+
FILE * stream, const gpt_params & params, const llama_context * lctx,
353+
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
354+

cpp/ggml-backend.c

+2-3
Original file line numberDiff line numberDiff line change
@@ -1182,9 +1182,9 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch
11821182
static char * fmt_size(size_t size) {
11831183
static char buffer[128];
11841184
if (size >= 1024*1024) {
1185-
sprintf(buffer, "%zuM", size/1024/1024);
1185+
snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
11861186
} else {
1187-
sprintf(buffer, "%zuK", size/1024);
1187+
snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
11881188
}
11891189
return buffer;
11901190
}
@@ -1895,7 +1895,6 @@ void lm_ggml_backend_view_init(lm_ggml_backend_buffer_t buffer, struct lm_ggml_t
18951895

18961896
tensor->buffer = buffer;
18971897
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
1898-
tensor->backend = tensor->view_src->backend;
18991898
lm_ggml_backend_buffer_init_tensor(buffer, tensor);
19001899
}
19011900

cpp/ggml-common.h

-54
Original file line numberDiff line numberDiff line change
@@ -65,13 +65,8 @@ typedef sycl::half2 lm_ggml_half2;
6565
// QK = number of values after dequantization
6666
// QK_K = super-block size
6767

68-
#ifdef LM_GGML_QKK_64
69-
#define QK_K 64
70-
#define K_SCALE_SIZE 4
71-
#else
7268
#define QK_K 256
7369
#define K_SCALE_SIZE 12
74-
#endif // LM_GGML_QKK_64
7570

7671
#if defined(LM_GGML_COMMON_DECL_CUDA) || defined(LM_GGML_COMMON_DECL_HIP) || defined(LM_GGML_COMMON_DECL_SYCL)
7772
// QR = QK / number of values before dequantization
@@ -131,13 +126,8 @@ typedef sycl::half2 lm_ggml_half2;
131126
#define QI4_NL (QK4_NL / (4*QR4_NL))
132127
#define QR4_NL 2
133128

134-
#if QK_K == 64
135-
#define QI4_XS QI4_NL
136-
#define QR4_XS QR4_NL
137-
#else
138129
#define QI4_XS (QK_K / (4*QR4_XS))
139130
#define QR4_XS 8
140-
#endif
141131

142132
#endif // LM_GGML_COMMON_DECL_CUDA || LM_GGML_COMMON_DECL_HIP || LM_GGML_COMMON_DECL_SYCL
143133

@@ -228,36 +218,18 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(lm_ggml_half) + QK_K/16 + QK_K/4, "
228218
// weight is represented as x = a * q
229219
// 16 blocks of 16 elements each
230220
// Effectively 3.4375 bits per weight
231-
#ifdef LM_GGML_QKK_64
232-
typedef struct {
233-
uint8_t hmask[QK_K/8]; // quants - high bit
234-
uint8_t qs[QK_K/4]; // quants - low 2 bits
235-
uint8_t scales[2];
236-
lm_ggml_half d; // super-block scale
237-
} block_q3_K;
238-
static_assert(sizeof(block_q3_K) == sizeof(lm_ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
239-
#else
240221
typedef struct {
241222
uint8_t hmask[QK_K/8]; // quants - high bit
242223
uint8_t qs[QK_K/4]; // quants - low 2 bits
243224
uint8_t scales[12]; // scales, quantized with 6 bits
244225
lm_ggml_half d; // super-block scale
245226
} block_q3_K;
246227
static_assert(sizeof(block_q3_K) == sizeof(lm_ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
247-
#endif
248228

249229
// 4-bit quantization
250230
// 8 blocks of 32 elements each
251231
// weight is represented as x = a * q + b
252232
// Effectively 4.5 bits per weight
253-
#ifdef LM_GGML_QKK_64
254-
typedef struct {
255-
lm_ggml_half d[2]; // super-block scales/mins
256-
uint8_t scales[2]; // 4-bit block scales/mins
257-
uint8_t qs[QK_K/2]; // 4--bit quants
258-
} block_q4_K;
259-
static_assert(sizeof(block_q4_K) == 2*sizeof(lm_ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
260-
#else
261233
typedef struct {
262234
union {
263235
struct {
@@ -270,21 +242,11 @@ typedef struct {
270242
uint8_t qs[QK_K/2]; // 4-bit quants
271243
} block_q4_K;
272244
static_assert(sizeof(block_q4_K) == 2*sizeof(lm_ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
273-
#endif
274245

275246
// 5-bit quantization
276247
// 8 blocks of 32 elements each
277248
// weight is represented as x = a * q + b
278249
// Effectively 5.5 bits per weight
279-
#ifdef LM_GGML_QKK_64
280-
typedef struct {
281-
lm_ggml_half d; // super-block scale
282-
int8_t scales[QK_K/16]; // 8-bit block scales
283-
uint8_t qh[QK_K/8]; // quants, high bit
284-
uint8_t qs[QK_K/2]; // quants, low 4 bits
285-
} block_q5_K;
286-
static_assert(sizeof(block_q5_K) == sizeof(lm_ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
287-
#else
288250
typedef struct {
289251
union {
290252
struct {
@@ -298,7 +260,6 @@ typedef struct {
298260
uint8_t qs[QK_K/2]; // quants, low 4 bits
299261
} block_q5_K;
300262
static_assert(sizeof(block_q5_K) == 2*sizeof(lm_ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
301-
#endif
302263

303264
// 6-bit quantization
304265
// weight is represented as x = a * q
@@ -356,11 +317,7 @@ typedef struct {
356317
static_assert(sizeof(block_iq3_xxs) == sizeof(lm_ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
357318

358319
// 3.4375 bpw
359-
#if QK_K == 64
360-
#define IQ3S_N_SCALE 2
361-
#else
362320
#define IQ3S_N_SCALE QK_K/64
363-
#endif
364321
typedef struct {
365322
lm_ggml_half d;
366323
uint8_t qs[QK_K/4];
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(lm_ggml_half) + QK_K/8 + QK_K/16, "w
381338
typedef struct {
382339
uint8_t qs[QK_K/8]; // grid index, low 8 bits
383340
uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
384-
#if QK_K == 64
385-
lm_ggml_half d;
386-
#endif
387341
uint8_t scales[QK_K/32]; // 3-bit block scales (QK_K is fixed at 256; the 4-bit QK_K == 64 layout was removed)
388342
} block_iq1_m;
389-
#if QK_K == 64
390-
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(lm_ggml_half), "wrong iq1_m block size/padding");
391-
#else
392343
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
393-
#endif
394344

395345
// Used by IQ1_M quants
396346
typedef union {
@@ -406,17 +356,13 @@ typedef struct {
406356
} block_iq4_nl;
407357
static_assert(sizeof(block_iq4_nl) == sizeof(lm_ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
408358

409-
#if QK_K == 64
410-
#define block_iq4_xs block_iq4_nl
411-
#else
412359
typedef struct {
413360
lm_ggml_half d;
414361
uint16_t scales_h;
415362
uint8_t scales_l[QK_K/64];
416363
uint8_t qs[QK_K/2];
417364
} block_iq4_xs;
418365
static_assert(sizeof(block_iq4_xs) == sizeof(lm_ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
419-
#endif
420366

421367
#endif // LM_GGML_COMMON_DECL
422368
#endif // LM_GGML_COMMON_DECL

0 commit comments

Comments
 (0)