Allow specifying p scale factor for ggml rope and rope_back ops #1967
Changes from 2 commits
```diff
@@ -6603,6 +6603,7 @@ struct ggml_tensor * ggml_rope_impl(
         int                   n_past,
         int                   n_dims,
         int                   mode,
+        float                 p_scale,
         bool                  inplace) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;
```
```diff
@@ -6615,11 +6616,13 @@ struct ggml_tensor * ggml_rope_impl(
     ggml_scratch_save(ctx);

-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
+    ggml_set_name(b, "n_past, n_dims, mode, p_scale");

-    ((int32_t *) b->data)[0] = n_past;
-    ((int32_t *) b->data)[1] = n_dims;
-    ((int32_t *) b->data)[2] = mode;
+    ((float *) b->data)[0] = (float)n_past;
+    ((float *) b->data)[1] = (float)n_dims;
+    ((float *) b->data)[2] = (float)mode;
+    ((float *) b->data)[3] = p_scale;

     ggml_scratch_load(ctx);
```
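ggml passes op parameters through a small tensor, so promoting the options buffer from I32 to F32 is what lets the float `p_scale` ride along with the three integer options. A minimal standalone sketch of that round trip (plain C, no ggml dependency; note an int survives the float round trip exactly only while it stays below 2^24):

```c
#include <assert.h>
#include <stdio.h>

int main(void) {
    // pack the four rope options the way the patched ggml_rope_impl does
    float data[4];
    const int   n_past  = 512;
    const int   n_dims  = 128;
    const int   mode    = 0;
    const float p_scale = 0.5f;

    data[0] = (float)n_past;
    data[1] = (float)n_dims;
    data[2] = (float)mode;
    data[3] = p_scale;

    // decode exactly as the patched compute kernels do
    assert((int)data[0] == n_past); // exact, since 512 < 2^24
    assert((int)data[1] == n_dims);
    assert((int)data[2] == mode);
    printf("n_past=%d n_dims=%d mode=%d p_scale=%g\n",
           (int)data[0], (int)data[1], (int)data[2], data[3]);
    return 0;
}
```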
```diff
@@ -6637,7 +6640,7 @@ struct ggml_tensor * ggml_rope(
         int                   n_past,
         int                   n_dims,
         int                   mode) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 1.0, false);
 }

 struct ggml_tensor * ggml_rope_inplace(
```
```diff
@@ -6646,17 +6649,39 @@ struct ggml_tensor * ggml_rope_inplace(
         int                   n_past,
         int                   n_dims,
         int                   mode) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 1.0, true);
+}
+
+struct ggml_tensor * ggml_rope_scaled(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        int                   mode,
+        float                 p_scale) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, p_scale, false);
+}
+
+struct ggml_tensor * ggml_rope_scaled_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        int                   mode,
+        float                 p_scale) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, p_scale, true);
 }

 // ggml_rope_back

-struct ggml_tensor * ggml_rope_back(
+struct ggml_tensor * ggml_rope_back_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         int                   n_past,
         int                   n_dims,
-        int                   mode) {
+        int                   mode,
+        float                 p_scale) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;
```
```diff
@@ -6668,12 +6693,13 @@ struct ggml_tensor * ggml_rope_back(
     ggml_scratch_save(ctx);

-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
-    ggml_set_name(b, "n_past, n_dims, mode");
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
+    ggml_set_name(b, "n_past, n_dims, mode, p_scale");

-    ((int32_t *) b->data)[0] = n_past;
-    ((int32_t *) b->data)[1] = n_dims;
-    ((int32_t *) b->data)[2] = mode;
+    ((float *) b->data)[0] = (float)n_past;
+    ((float *) b->data)[1] = (float)n_dims;
+    ((float *) b->data)[2] = (float)mode;
+    ((float *) b->data)[3] = p_scale;

     ggml_scratch_load(ctx);
```
```diff
@@ -6685,6 +6711,26 @@ struct ggml_tensor * ggml_rope_back(
     return result;
 }

+struct ggml_tensor * ggml_rope_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        int                   mode) {
+    return ggml_rope_back_impl(ctx, a, n_past, n_dims, mode, 1.0);
+}
+
+struct ggml_tensor * ggml_rope_back_scaled(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        int                   mode,
+        float                 p_scale) {
+    return ggml_rope_back_impl(ctx, a, n_past, n_dims, mode, p_scale);
+}
+
 // ggml_alibi

 struct ggml_tensor * ggml_alibi(
```
```diff
@@ -12110,16 +12156,17 @@ static void ggml_compute_forward_rope_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 3);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_nelements(src1) == 4);

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }

-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_dims = ((int32_t *) src1->data)[1];
-    const int mode   = ((int32_t *) src1->data)[2];
+    const int   n_past  = (int)((float *) src1->data)[0];
+    const int   n_dims  = (int)((float *) src1->data)[1];
+    const int   mode    = (int)((float *) src1->data)[2];
+    const float p_scale = ((float *) src1->data)[3];

     assert(n_past >= 0);
```
```diff
@@ -12172,7 +12219,7 @@ static void ggml_compute_forward_rope_f32(
             if (ir++ < ir0) continue;
             if (ir   > ir1) break;

-            float theta = (float)p;
+            float theta = p_scale * (float)p;

             if (!is_neox) {
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
```
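This one-line kernel change is the heart of the patch: pre-multiplying the position `p` by `p_scale` means that with `p_scale = 0.5` a token at position 4096 receives exactly the rotation an unscaled token at position 2048 would, which is the linear-interpolation trick for stretching a model's context window. A self-contained sketch of one rotation pair, assuming the standard `10000^(-i0/n_dims)` frequency schedule of ggml's non-neox path (compile with `-lm`):

```c
#include <math.h>
#include <stdio.h>

// rotate one (x0, x1) pair the way the non-neox path does,
// with the position pre-multiplied by p_scale (illustrative sketch)
static void rope_pair(float * x0, float * x1,
                      int p, int i0, int n_dims, float p_scale) {
    const float theta = p_scale * (float)p
                      * powf(10000.0f, -(float)i0 / (float)n_dims);
    const float c = cosf(theta);
    const float s = sinf(theta);
    const float v0 = *x0;
    const float v1 = *x1;
    *x0 = v0*c - v1*s;
    *x1 = v0*s + v1*c;
}

int main(void) {
    float a0 = 1.0f, a1 = 0.0f;
    float b0 = 1.0f, b1 = 0.0f;
    rope_pair(&a0, &a1, 4096, 0, 128, 0.5f); // position 4096, scaled by 0.5
    rope_pair(&b0, &b1, 2048, 0, 128, 1.0f); // position 2048, unscaled
    printf("scaled:   (%f, %f)\n", a0, a1);  // identical to the line below,
    printf("unscaled: (%f, %f)\n", b0, b1);  // since 0.5 * 4096 == 2048
    return 0;
}
```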
```diff
@@ -12223,16 +12270,17 @@ static void ggml_compute_forward_rope_f16(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 3);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_nelements(src1) == 4);

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }

-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_dims = ((int32_t *) src1->data)[1];
-    const int mode   = ((int32_t *) src1->data)[2];
+    const int   n_past  = (int)((float *) src1->data)[0];
+    const int   n_dims  = (int)((float *) src1->data)[1];
+    const int   mode    = (int)((float *) src1->data)[2];
+    const float p_scale = ((float *) src1->data)[3];

     assert(n_past >= 0);
```
```diff
@@ -12285,7 +12333,7 @@ static void ggml_compute_forward_rope_f16(
             if (ir++ < ir0) continue;
             if (ir   > ir1) break;

-            float theta = (float)p;
+            float theta = p_scale * (float)p;

             if (!is_neox) {
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
```
```diff
@@ -12359,8 +12407,8 @@ static void ggml_compute_forward_rope_back_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 3);
+    assert(src1->type == GGML_TYPE_F32);
+    assert(ggml_nelements(src1) == 4);

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
```
```diff
@@ -12370,9 +12418,10 @@ static void ggml_compute_forward_rope_back_f32(
     // dx = rope_back(dy, src1)
     // src0 is dy, src1 contains options

-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_dims = ((int32_t *) src1->data)[1];
-    const int mode   = ((int32_t *) src1->data)[2];
+    const int   n_past  = (int)((float *) src1->data)[0];
+    const int   n_dims  = (int)((float *) src1->data)[1];
+    const int   mode    = (int)((float *) src1->data)[2];
+    const float p_scale = ((float *) src1->data)[3];

     assert(n_past >= 0);
```
```diff
@@ -12423,7 +12472,7 @@ static void ggml_compute_forward_rope_back_f32(
             if (ir++ < ir0) continue;
             if (ir   > ir1) break;

-            float theta = (float)p;
+            float theta = p_scale * (float)p;

             if (!is_neox) {
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
```
```diff
@@ -12472,8 +12521,8 @@ static void ggml_compute_forward_rope_back_f16(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 3);
+    assert(src1->type == GGML_TYPE_F32);
+    assert(ggml_nelements(src1) == 4);

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
```
```diff
@@ -12483,9 +12532,10 @@ static void ggml_compute_forward_rope_back_f16(
     // dx = rope_back(dy, src1)
     // src0 is dy, src1 contains options

-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_dims = ((int32_t *) src1->data)[1];
-    const int mode   = ((int32_t *) src1->data)[2];
+    const int   n_past  = (int)((float *) src1->data)[0];
+    const int   n_dims  = (int)((float *) src1->data)[1];
+    const int   mode    = (int)((float *) src1->data)[2];
+    const float p_scale = ((float *) src1->data)[3];

     assert(n_past >= 0);
```
```diff
@@ -12536,7 +12586,7 @@ static void ggml_compute_forward_rope_back_f16(
             if (ir++ < ir0) continue;
             if (ir   > ir1) break;

-            float theta = (float)p;
+            float theta = p_scale * (float)p;

             if (!is_neox) {
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
```
```diff
@@ -15713,18 +15763,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 // necessary for llama
                 if (src0->grad) {
-                    assert(src1->type == GGML_TYPE_I32);
-                    assert(ggml_nelements(src1) == 3);
-                    const int n_past = ((int32_t *) src1->data)[0];
-                    const int n_dims = ((int32_t *) src1->data)[1];
-                    const int mode   = ((int32_t *) src1->data)[2];
+                    assert(src1->type == GGML_TYPE_F32);
+                    assert(ggml_nelements(src1) == 4);
+                    const int   n_past  = (int)((float *) src1->data)[0];
+                    const int   n_dims  = (int)((float *) src1->data)[1];
+                    const int   mode    = (int)((float *) src1->data)[2];
+                    const float p_scale = ((float *) src1->data)[3];
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
-                            ggml_rope_back(ctx,
+                            ggml_rope_back_scaled(ctx,
                                     tensor->grad,
                                     n_past,
                                     n_dims,
-                                    mode),
+                                    mode,
+                                    p_scale),
                             inplace);
                 }
                 if (src1->grad) {
```
```diff
@@ -15734,18 +15786,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
         case GGML_OP_ROPE_BACK:
             {
                 if (src0->grad) {
-                    assert(src1->type == GGML_TYPE_I32);
-                    assert(ggml_nelements(src1) == 3);
-                    const int n_past = ((int32_t *) src1->data)[0];
-                    const int n_dims = ((int32_t *) src1->data)[1];
-                    const int mode   = ((int32_t *) src1->data)[2];
+                    assert(src1->type == GGML_TYPE_F32);
+                    assert(ggml_nelements(src1) == 4);
+                    const int   n_past  = (int)((float *) src1->data)[0];
+                    const int   n_dims  = (int)((float *) src1->data)[1];
+                    const int   mode    = (int)((float *) src1->data)[2];
+                    const float p_scale = ((float *) src1->data)[3];
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
-                            ggml_rope(ctx,
+                            ggml_rope_scaled(ctx,
                                     tensor->grad,
                                     n_past,
                                     n_dims,
-                                    mode),
+                                    mode,
+                                    p_scale),
                             inplace);
                 }
                 if (src1->grad) {
```
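Both backward cases thread the recovered `p_scale` into the scaled variants, which is required for the gradient to be correct: RoPE applies an orthogonal 2x2 rotation to each pair, so the pullback is the transposed (inverse) rotation at the same scaled angle. Sketched for a single pair, with R(θ) the rotation matrix:

```latex
y = R(\theta)\,x, \qquad
\theta = p_{\mathrm{scale}} \cdot p \cdot 10000^{-i_0/n_{\mathrm{dims}}}
\quad\Longrightarrow\quad
\frac{\partial L}{\partial x}
  = R(\theta)^{\top}\,\frac{\partial L}{\partial y}
  = R(-\theta)\,\frac{\partial L}{\partial y}.
```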
```diff
@@ -1044,6 +1044,24 @@ extern "C" {
             int                   n_dims,
             int                   mode);

+    // same as ggml_rope but allows specifying p scale factor
+    GGML_API struct ggml_tensor * ggml_rope_scaled(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            float                 p_scale);
+
+    // same as ggml_rope_inplace but allows specifying p scale factor
+    GGML_API struct ggml_tensor * ggml_rope_scaled_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            float                 p_scale);
+
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
     GGML_API struct ggml_tensor * ggml_rope_back(
```

Review comment: No need to extend the API - add …

Review comment: Won't this break every single thing that currently uses the llama.cpp version of GGML? What do you think about using a define to enable the p_scale argument for rope and having it be off by default? That way existing stuff can opt in. It might also be worth thinking about adding something like …
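For concreteness, a minimal sketch of what that opt-in define could look like (the `GGML_ROPE_USE_P_SCALE` macro name and the constants are hypothetical, not from the PR): compile with `-DGGML_ROPE_USE_P_SCALE` to select the wider F32 options layout, otherwise keep the legacy 3-element I32 one.

```c
#include <stdio.h>

// hypothetical opt-in switch for the p_scale extension; the macro name
// and the layout constants below are illustrative, not from the PR
#ifdef GGML_ROPE_USE_P_SCALE
    #define ROPE_OPTS_TYPE  "GGML_TYPE_F32"
    #define ROPE_OPTS_COUNT 4
#else
    #define ROPE_OPTS_TYPE  "GGML_TYPE_I32"
    #define ROPE_OPTS_COUNT 3
#endif

int main(void) {
    printf("rope options tensor: %s, %d elements\n",
           ROPE_OPTS_TYPE, ROPE_OPTS_COUNT);
    return 0;
}
```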
```diff
@@ -1053,6 +1071,15 @@ extern "C" {
             int                   n_dims,
             int                   mode);

+    // same as ggml_rope_back but allows specifying p scale factor
+    GGML_API struct ggml_tensor * ggml_rope_back_scaled(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            float                 p_scale);
+
     // alibi position embedding
     // in-place, returns view(a)
     struct ggml_tensor * ggml_alibi(
```
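With the new declarations in place, a caller picks a scale per graph build. A hedged usage sketch against the ggml API of this era (`ggml_build_forward` returning the graph by value); the tensor shape, `mode = 0`, and the uninitialized input are illustrative, since the sketch only exercises the graph API:

```c
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // toy activations: n_tokens rows of n_dims values to rotate
    const int n_dims = 128, n_tokens = 8, n_past = 0, mode = 0;
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_dims, n_tokens);

    // p_scale = 0.5f compresses positions 2x (SuperHOT-style context stretch)
    struct ggml_tensor * r = ggml_rope_scaled(ctx, a, n_past, n_dims, mode, 0.5f);

    struct ggml_cgraph gf = ggml_build_forward(r);
    gf.n_threads = 1;
    ggml_graph_compute(ctx, &gf);

    ggml_free(ctx);
    return 0;
}
```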