Skip to content

Commit eb500b7

Browse files
authored
fix: ensure nlp/sentencize handles punctuation in quotation marks properly
PR-URL: #5381
Closes: #3017
Closes: stdlib-js/metr-issue-tracker#2
Reviewed-by: Athan Reines <[email protected]>
1 parent 586bc1f commit eb500b7

File tree

2 files changed

+69
-6
lines changed

2 files changed

+69
-6
lines changed

lib/node_modules/@stdlib/nlp/sentencize/lib/main.js

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,13 @@ var trim = require( '@stdlib/string/base/trim' );
2727

2828
// VARIABLES //
2929

30+
var RE_LOWERCASE = /^[a-z]+$/;
3031
var RE_CAPITALIZED = /^[A-Z][a-z]{0,4}$/;
3132
var RE_CAPITALIZED_PERIOD = /^([A-Z]\.)*[A-Z]$/;
3233
var RE_NUMBER = /^[0-9]$/;
3334
var RE_PREFIXES = /^[{[(<:;"'`]/;
3435
var RE_SUFFIXES = /[})\]>:;"'”`]$/;
36+
var RE_QUOTES = /^["'`]$/;
3537

3638

3739
// FUNCTIONS //
@@ -51,10 +53,38 @@ var RE_SUFFIXES = /[})\]>:;"'”`]$/;
5153
* @returns {boolean} boolean indicating whether the token at a specified index is an end-of-sentence token
5254
*/
5355
function isEndOfSentence( tokens, i ) {
56+
var nextToken;
5457
var token;
5558
var im1 = i - 1;
5659
var ip1 = i + 1;
60+
5761
token = tokens[ i ];
62+
63+
// Handle quoted text with punctuation...
64+
if (
65+
RE_QUOTES.test( token ) &&
66+
i > 0 &&
67+
( tokens[ im1 ] === '.' || tokens[ im1 ] === '!' || tokens[ im1 ] === '?' )
68+
) {
69+
// Look ahead to see if sentence continues:
70+
ip1 = i + 1;
71+
if ( ip1 < tokens.length ) {
72+
// Skip spaces...
73+
while ( ip1 < tokens.length && tokens[ ip1 ] === ' ' ) {
74+
ip1 += 1;
75+
}
76+
// If next non-space token is lowercase, we assume the sentence continues:
77+
if ( ip1 < tokens.length ) {
78+
nextToken = tokens[ ip1 ];
79+
if ( RE_LOWERCASE.test( nextToken ) ) {
80+
return false;
81+
}
82+
}
83+
}
84+
return true;
85+
}
86+
87+
// Regular sentence ending punctuation...
5888
if (
5989
token === '.' &&
6090
!RE_CAPITALIZED.test( tokens[ im1 ] ) && // for other short abbreviations and bullet points
@@ -73,12 +103,6 @@ function isEndOfSentence( tokens, i ) {
73103
) {
74104
return true;
75105
}
76-
if (
77-
RE_SUFFIXES.test( token ) &&
78-
( tokens[ im1 ] === '.' || tokens[ im1 ] === '!' || tokens[ im1 ] === '?' )
79-
) {
80-
return true;
81-
}
82106
return false;
83107
}
84108

@@ -112,6 +136,7 @@ function sentencize( str ) {
112136
var tokens;
113137
var out;
114138
var i;
139+
115140
if ( !isString( str ) ) {
116141
throw new TypeError( 'invalid argument. Must provide a string. Value: `' + str + '`.' );
117142
}

lib/node_modules/@stdlib/nlp/sentencize/test/test.js

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,3 +317,41 @@ tape( 'the function returns an empty array if provided an empty string', functio
317317
t.equal( out.length, 0, 'array length is zero' );
318318
t.end();
319319
});
320+
321+
tape( 'the function correctly handles punctuation within quotation marks', function test( t ) {
322+
var expected;
323+
var actual;
324+
var str;
325+
326+
str = 'I said "Look out" right before he banged his head.';
327+
expected = [ 'I said "Look out" right before he banged his head.' ];
328+
actual = sentencize( str );
329+
t.deepEqual( actual, expected, 'keeps sentence with simple quotes together' );
330+
331+
str = 'I said "Look out!" right before he banged his head.';
332+
expected = [ 'I said "Look out!" right before he banged his head.' ];
333+
actual = sentencize( str );
334+
t.deepEqual( actual, expected, 'keeps sentence with exclamation in quotes together' );
335+
336+
str = 'He asked "What time is it?" before leaving.';
337+
expected = [ 'He asked "What time is it?" before leaving.' ];
338+
actual = sentencize( str );
339+
t.deepEqual( actual, expected, 'keeps sentence with question mark in quotes together' );
340+
341+
str = '"Stop!" he yelled. "We need to think about this."';
342+
expected = [ '"Stop!" he yelled.', '"We need to think about this."' ];
343+
actual = sentencize( str );
344+
t.deepEqual( actual, expected, 'correctly splits multiple quoted sentences' );
345+
346+
str = 'She said "This is great!" and smiled.';
347+
expected = [ 'She said "This is great!" and smiled.' ];
348+
actual = sentencize( str );
349+
t.deepEqual( actual, expected, 'keeps sentence with exclamation in middle quotes together' );
350+
351+
str = '"Is this correct?" he wondered. "I think so!" she replied.';
352+
expected = [ '"Is this correct?" he wondered.', '"I think so!" she replied.' ];
353+
actual = sentencize( str );
354+
t.deepEqual( actual, expected, 'correctly handles multiple quoted sentences with different punctuation' );
355+
356+
t.end();
357+
});

0 commit comments

Comments
 (0)