diff --git a/lib/node_modules/@stdlib/nlp/sentencize/lib/main.js b/lib/node_modules/@stdlib/nlp/sentencize/lib/main.js index 8cf46b32640f..e41feaf0130a 100644 --- a/lib/node_modules/@stdlib/nlp/sentencize/lib/main.js +++ b/lib/node_modules/@stdlib/nlp/sentencize/lib/main.js @@ -27,11 +27,13 @@ var trim = require( '@stdlib/string/base/trim' ); // VARIABLES // +var RE_LOWERCASE = /^[a-z]+$/; var RE_CAPITALIZED = /^[A-Z][a-z]{0,4}$/; var RE_CAPITALIZED_PERIOD = /^([A-Z]\.)*[A-Z]$/; var RE_NUMBER = /^[0-9]$/; var RE_PREFIXES = /^[{[(<:;"'”`]/; var RE_SUFFIXES = /[})\]>:;"'”`]$/; +var RE_QUOTES = /^["'`]$/; // FUNCTIONS // @@ -51,10 +53,38 @@ var RE_SUFFIXES = /[})\]>:;"'”`]$/; * @returns {boolean} boolean indicating whether the token at a specified index is an end-of-sentence token */ function isEndOfSentence( tokens, i ) { + var nextToken; var token; var im1 = i - 1; var ip1 = i + 1; + token = tokens[ i ]; + + // Handle quoted text with punctuation... + if ( + RE_QUOTES.test( token ) && + i > 0 && + ( tokens[ im1 ] === '.' || tokens[ im1 ] === '!' || tokens[ im1 ] === '?' ) + ) { + // Look ahead to see if sentence continues: + ip1 = i + 1; + if ( ip1 < tokens.length ) { + // Skip spaces... + while ( ip1 < tokens.length && tokens[ ip1 ] === ' ' ) { + ip1 += 1; + } + // If next non-space token is lowercase, we assume the sentence continues: + if ( ip1 < tokens.length ) { + nextToken = tokens[ ip1 ]; + if ( RE_LOWERCASE.test( nextToken ) ) { + return false; + } + } + } + return true; + } + + // Regular sentence ending punctuation... if ( token === '.' && !RE_CAPITALIZED.test( tokens[ im1 ] ) && // for other short abbreviations and bullet points @@ -73,12 +103,6 @@ function isEndOfSentence( tokens, i ) { ) { return true; } - if ( - RE_SUFFIXES.test( token ) && - ( tokens[ im1 ] === '.' || tokens[ im1 ] === '!' || tokens[ im1 ] === '?' ) - ) { - return true; - } return false; } @@ -112,6 +136,7 @@ function sentencize( str ) { var tokens; var out; var i; + if ( !isString( str ) ) { throw new TypeError( 'invalid argument. Must provide a string. Value: `' + str + '`.' ); } diff --git a/lib/node_modules/@stdlib/nlp/sentencize/test/test.js b/lib/node_modules/@stdlib/nlp/sentencize/test/test.js index 304534773b85..95e641e770c0 100644 --- a/lib/node_modules/@stdlib/nlp/sentencize/test/test.js +++ b/lib/node_modules/@stdlib/nlp/sentencize/test/test.js @@ -317,3 +317,41 @@ tape( 'the function returns an empty array if provided an empty string', functio t.equal( out.length, 0, 'array length is zero' ); t.end(); }); + +tape( 'the function correctly handles punctuation within quotation marks', function test( t ) { + var expected; + var actual; + var str; + + str = 'I said "Look out" right before he banged his head.'; + expected = [ 'I said "Look out" right before he banged his head.' ]; + actual = sentencize( str ); + t.deepEqual( actual, expected, 'keeps sentence with simple quotes together' ); + + str = 'I said "Look out!" right before he banged his head.'; + expected = [ 'I said "Look out!" right before he banged his head.' ]; + actual = sentencize( str ); + t.deepEqual( actual, expected, 'keeps sentence with exclamation in quotes together' ); + + str = 'He asked "What time is it?" before leaving.'; + expected = [ 'He asked "What time is it?" before leaving.' ]; + actual = sentencize( str ); + t.deepEqual( actual, expected, 'keeps sentence with question mark in quotes together' ); + + str = '"Stop!" he yelled. "We need to think about this."'; + expected = [ '"Stop!" he yelled.', '"We need to think about this."' ]; + actual = sentencize( str ); + t.deepEqual( actual, expected, 'correctly splits multiple quoted sentences' ); + + str = 'She said "This is great!" and smiled.'; + expected = [ 'She said "This is great!" and smiled.' ]; + actual = sentencize( str ); + t.deepEqual( actual, expected, 'keeps sentence with exclamation in middle quotes together' ); + + str = '"Is this correct?" he wondered. "I think so!" she replied.'; + expected = [ '"Is this correct?" he wondered.', '"I think so!" she replied.' ]; + actual = sentencize( str ); + t.deepEqual( actual, expected, 'correctly handles multiple quoted sentences with different punctuation' ); + + t.end(); +});