Skip to content

Commit d5f13ee

Browse files
committed
fix: substr - correct behaivour with negative start pos
1 parent 6ec18bb commit d5f13ee

File tree

2 files changed

+94
-4
lines changed

2 files changed

+94
-4
lines changed

datafusion/src/physical_plan/functions.rs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3582,6 +3582,30 @@ mod tests {
35823582
StringArray
35833583
);
35843584
#[cfg(feature = "unicode_expressions")]
3585+
test_function!(
3586+
Substr,
3587+
&[
3588+
lit(ScalarValue::Utf8(Some("joséésoj".to_string()))),
3589+
lit(ScalarValue::Int64(Some(0))),
3590+
],
3591+
Ok(Some("joséésoj")),
3592+
&str,
3593+
Utf8,
3594+
StringArray
3595+
);
3596+
#[cfg(feature = "unicode_expressions")]
3597+
test_function!(
3598+
Substr,
3599+
&[
3600+
lit(ScalarValue::Utf8(Some("joséésoj".to_string()))),
3601+
lit(ScalarValue::Int64(Some(-5))),
3602+
],
3603+
Ok(Some("joséésoj")),
3604+
&str,
3605+
Utf8,
3606+
StringArray
3607+
);
3608+
#[cfg(feature = "unicode_expressions")]
35853609
test_function!(
35863610
Substr,
35873611
&[
@@ -3680,6 +3704,60 @@ mod tests {
36803704
StringArray
36813705
);
36823706
#[cfg(feature = "unicode_expressions")]
3707+
test_function!(
3708+
Substr,
3709+
&[
3710+
lit(ScalarValue::Utf8(Some("alphabet".to_string()))),
3711+
lit(ScalarValue::Int64(Some(0))),
3712+
lit(ScalarValue::Int64(Some(5))),
3713+
],
3714+
Ok(Some("alpha")),
3715+
&str,
3716+
Utf8,
3717+
StringArray
3718+
);
3719+
#[cfg(feature = "unicode_expressions")]
3720+
test_function!(
3721+
Substr,
3722+
&[
3723+
lit(ScalarValue::Utf8(Some("alphabet".to_string()))),
3724+
lit(ScalarValue::Int64(Some(-5))),
3725+
lit(ScalarValue::Int64(Some(10))),
3726+
],
3727+
Ok(Some("alpha")),
3728+
&str,
3729+
Utf8,
3730+
StringArray
3731+
);
3732+
// starting from -1 (4 + -5)
3733+
#[cfg(feature = "unicode_expressions")]
3734+
test_function!(
3735+
Substr,
3736+
&[
3737+
lit(ScalarValue::Utf8(Some("alphabet".to_string()))),
3738+
lit(ScalarValue::Int64(Some(-5))),
3739+
lit(ScalarValue::Int64(Some(4))),
3740+
],
3741+
Ok(Some("")),
3742+
&str,
3743+
Utf8,
3744+
StringArray
3745+
);
3746+
// starting from 0 (5 + -5)
3747+
#[cfg(feature = "unicode_expressions")]
3748+
test_function!(
3749+
Substr,
3750+
&[
3751+
lit(ScalarValue::Utf8(Some("alphabet".to_string()))),
3752+
lit(ScalarValue::Int64(Some(-5))),
3753+
lit(ScalarValue::Int64(Some(5))),
3754+
],
3755+
Ok(Some("")),
3756+
&str,
3757+
Utf8,
3758+
StringArray
3759+
);
3760+
#[cfg(feature = "unicode_expressions")]
36833761
test_function!(
36843762
Substr,
36853763
&[

datafusion/src/physical_plan/unicode_expressions.rs

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -455,12 +455,24 @@ pub fn substr<T: StringOffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
455455
Err(DataFusionError::Execution(
456456
"negative substring length not allowed".to_string(),
457457
))
458-
} else if start <= 0 {
459-
Ok(Some(string.to_string()))
460458
} else {
461459
let graphemes = string.graphemes(true).collect::<Vec<&str>>();
462-
let start_pos = start as usize - 1;
463-
let count_usize = count as usize;
460+
let start_pos = if start > 0 {
461+
start as usize - 1
462+
} else {
463+
0
464+
};
465+
let count_usize = if start < 0 {
466+
let count_size = count + start;
467+
if count_size <= 0 {
468+
return Ok(Some("".to_string()))
469+
} else {
470+
count_size as usize
471+
}
472+
} else {
473+
count as usize
474+
};
475+
464476
if graphemes.len() < start_pos {
465477
Ok(Some("".to_string()))
466478
} else if graphemes.len() < start_pos + count_usize {

0 commit comments

Comments
 (0)