diff --git a/OpenGrok b/OpenGrok index 761481792bd..b007a38289b 100755 --- a/OpenGrok +++ b/OpenGrok @@ -93,6 +93,8 @@ Supported Environment Variables for configuring the default setup: - OPENGROK_PROGRESS Shows progress in %(percentage) of working through project. It's good to have Verbose Mode enabled too. (*) + - OPENGROK_ALL_NONWHITESPACE Index all non-whitespace for FULL queries. + on|off (default off) (^) - OPENGROK_RENAMED_FILES_HISTORY Get full history of renamed files for SCMs that support it (Git, Mercurial). When set to on, the indexing is slower, especially in the @@ -450,6 +452,12 @@ ${BZR:+-Dorg.opensolaris.opengrok.history.Bazaar=$BZR} \ ASSIGNMENTS="`echo $OPENGROK_ASSIGNMENTS | sed 's/[:space:]+/_/g'`" ASSIGNMENTS="-A `echo $ASSIGNMENTS | sed 's/,/ -A /g'`" fi + + OPENGROK_ALL_NONWHITESPACE="${OPENGROK_ALL_NONWHITESPACE:-off}" + case "$OPENGROK_ALL_NONWHITESPACE" in + on|true|1) ALL_NONWHITESPACE="--allNonWhitespace on" ;; + *) ALL_NONWHITESPACE="" ;; + esac } # @@ -898,6 +906,7 @@ CommonInvocation() ${OPENGROK_FLUSH_RAM_BUFFER_SIZE} ${SKIN} ${LEADING_WILDCARD} \ ${OPENGROK_PARALLELISM:+--threads} ${OPENGROK_PARALLELISM} \ ${ASSIGNMENTS} \ + ${ALL_NONWHITESPACE} \ ${READ_XML_CONF} \ ${WEBAPP_CONFIG} \ ${OPENGROK_PROFILER:+--profiler} \ diff --git a/build.xml b/build.xml index 3e6c9867a3f..9b0e30d06c1 100644 --- a/build.xml +++ b/build.xml @@ -367,7 +367,12 @@ Portions Copyright (c) 2017-2018, Chris Fraire . - + + + + + + diff --git a/opengrok-indexer/pom.xml b/opengrok-indexer/pom.xml index b9b35236265..c47a9682290 100644 --- a/opengrok-indexer/pom.xml +++ b/opengrok-indexer/pom.xml @@ -200,6 +200,13 @@ Portions Copyright (c) 2017-2018, Chris Fraire . 
*.java + + org/opensolaris/opengrok/analysis/plain/ + ../test/org/opensolaris/opengrok/analysis/plain/ + + *.java + + org/opensolaris/opengrok/analysis/powershell/ ../test/org/opensolaris/opengrok/analysis/powershell/ diff --git a/src/org/opensolaris/opengrok/analysis/CompatibleAnalyser.java b/src/org/opensolaris/opengrok/analysis/CompatibleAnalyser.java index 09b1c745387..96f9dbbc3b0 100644 --- a/src/org/opensolaris/opengrok/analysis/CompatibleAnalyser.java +++ b/src/org/opensolaris/opengrok/analysis/CompatibleAnalyser.java @@ -17,9 +17,9 @@ * CDDL HEADER END */ - /* +/* * Copyright (c) 2005, 2017, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017, Chris Fraire . + * Portions Copyright (c) 2017-2018, Chris Fraire . */ package org.opensolaris.opengrok.analysis; @@ -40,29 +40,30 @@ public CompatibleAnalyser() { protected TokenStreamComponents createComponents(String fieldName) { switch (fieldName) { case QueryBuilder.FULL: - return new TokenStreamComponents(createPlainFullTokenizer()); + return new TokenStreamComponents( + createNonWhitespaceFullTokenizer()); case QueryBuilder.REFS: - return new TokenStreamComponents(createPlainSymbolTokenizer()); + return new TokenStreamComponents( + createNonWhitespaceSymbolTokenizer()); case QueryBuilder.DEFS: - return new TokenStreamComponents(createPlainSymbolTokenizer()); + return new TokenStreamComponents( + createNonWhitespaceSymbolTokenizer()); case QueryBuilder.PATH: case QueryBuilder.PROJECT: return new TokenStreamComponents(new PathTokenizer()); case QueryBuilder.HIST: return new HistoryAnalyzer().createComponents(fieldName); default: - return new TokenStreamComponents(createPlainFullTokenizer()); + return new TokenStreamComponents( + createPlainFullTokenizer(TokenizerMode.SYMBOLS_ONLY)); } } - private JFlexTokenizer createPlainSymbolTokenizer() { - return new JFlexTokenizer(new PlainSymbolTokenizer( - FileAnalyzer.dummyReader)); - } - - private JFlexTokenizer createPlainFullTokenizer() { 
- return new JFlexTokenizer(new PlainFullTokenizer( + private JFlexTokenizer createPlainFullTokenizer(TokenizerMode mode) { + JFlexTokenizer tokenizer = new JFlexTokenizer(new PlainFullTokenizer( FileAnalyzer.dummyReader)); + tokenizer.setTokenizerMode(mode); + return tokenizer; } @Override @@ -75,4 +76,15 @@ protected TokenStream normalize(String fieldName, TokenStream in) { return new LowerCaseFilter(in); } } + + private JFlexTokenizer createNonWhitespaceFullTokenizer() { + return createPlainFullTokenizer(TokenizerMode.NON_WHITESPACE_ONLY); + } + + private JFlexTokenizer createNonWhitespaceSymbolTokenizer() { + JFlexTokenizer tokenizer = new JFlexTokenizer(new PlainSymbolTokenizer( + FileAnalyzer.dummyReader)); + tokenizer.setTokenizerMode(TokenizerMode.NON_WHITESPACE_ONLY); + return tokenizer; + } } \ No newline at end of file diff --git a/src/org/opensolaris/opengrok/analysis/FileAnalyzer.java b/src/org/opensolaris/opengrok/analysis/FileAnalyzer.java index a232f609765..9e0809442a7 100644 --- a/src/org/opensolaris/opengrok/analysis/FileAnalyzer.java +++ b/src/org/opensolaris/opengrok/analysis/FileAnalyzer.java @@ -66,6 +66,7 @@ public class FileAnalyzer extends Analyzer { protected Ctags ctags; protected boolean scopesEnabled; protected boolean foldingEnabled; + protected boolean allNonWhitespace; private final FileAnalyzerFactory factory; /** @@ -169,6 +170,14 @@ public void setFoldingEnabled(boolean foldingEnabled) { this.foldingEnabled = supportsScopes() && foldingEnabled; } + /** + * Sets a value indicating if all non-whitespace should be indexed for + * FULL search. Default is false. 
+ */ + public void setAllNonWhitespace(boolean value) { + this.allNonWhitespace = value; + } + protected boolean supportsScopes() { return false; } @@ -319,8 +328,13 @@ private JFlexTokenizer createPlainSymbolTokenizer() { } private JFlexTokenizer createPlainFullTokenizer() { - return new JFlexTokenizer(new PlainFullTokenizer( + JFlexTokenizer tokenizer = new JFlexTokenizer(new PlainFullTokenizer( FileAnalyzer.dummyReader)); + tokenizer.setTokenizerModeSupplier(() -> { + return allNonWhitespace ? TokenizerMode.SYMBOLS_AND_NON_WHITESPACE : + TokenizerMode.SYMBOLS_ONLY; + }); + return tokenizer; } @Override diff --git a/src/org/opensolaris/opengrok/analysis/JFlexSymbolMatcher.java b/src/org/opensolaris/opengrok/analysis/JFlexSymbolMatcher.java index 41e23ce06a9..730f0d866ed 100644 --- a/src/org/opensolaris/opengrok/analysis/JFlexSymbolMatcher.java +++ b/src/org/opensolaris/opengrok/analysis/JFlexSymbolMatcher.java @@ -38,6 +38,16 @@ public abstract class JFlexSymbolMatcher extends JFlexStateStacker private NonSymbolMatchedListener nonSymbolListener; private String disjointSpanClassName; + /** + * Gets a value indicating if the matcher is by-default case-insensitive -- + * i.e. whether tokens should be lower-cased when published in a stream. + * @return {@code false} but subclasses should override where necessary + */ + @Override + public boolean isDefaultCaseInsensitive() { + return false; + } + /** * Associates the specified listener, replacing the former one. * @param l defined instance @@ -78,6 +88,27 @@ public void clearNonSymbolMatchedListener() { nonSymbolListener = null; } + /** + * Does nothing. Subclasses can override if necessary to alter their + * behavior for different modes. + */ + @Override + public void setTokenizerMode(TokenizerMode value) { + } + + /** + * Does nothing. 
Subclasses can override to determine if {@code str} + * starts with a contraction (i.e., a word containing letters and non-word + * characters such as "ain't") according to the specific language. + * @param str a defined instance + * @return 0 if {@code str} does not start with a contraction; or else the + * length of the longest initial contraction + */ + @Override + public int getLongestContractionPrefix(String str) { + return 0; + } + /** * Gets the class name value from the last call to * {@link #onDisjointSpanChanged(java.lang.String, int)}. @@ -103,6 +134,24 @@ protected void onSymbolMatched(String str, int start) { } } + /** + * Raises + * {@link SymbolMatchedListener#symbolMatched(org.opensolaris.opengrok.analysis.SymbolMatchedEvent)} + * for a subscribed listener. + * @param literal the literal representation of the symbol + * @param str the symbol string + * @param start the symbol literal start position + */ + protected void onSymbolMatched(String literal, String str, int start) { + SymbolMatchedListener l = symbolListener; + if (l != null) { + // TODO: publish literal through SymbolMatchedEvent. + SymbolMatchedEvent evt = new SymbolMatchedEvent(this, str, start, + start + literal.length()); + l.symbolMatched(evt); + } + } + /** * Raises * {@link SymbolMatchedListener#sourceCodeSeen(org.opensolaris.opengrok.analysis.SourceCodeSeenEvent)} @@ -122,9 +171,11 @@ protected void onSourceCodeSeen(int start) { * {@link String#valueOf(char)} {@code c} and {@code start}. * @param c the text character * @param start the text start position + * @return {@code true} if one or more complete tokens were published from + * the text */ - protected void onNonSymbolMatched(char c, int start) { - onNonSymbolMatched(String.valueOf(c), start); + protected boolean onNonSymbolMatched(char c, int start) { + return onNonSymbolMatched(String.valueOf(c), start); } /** @@ -133,14 +184,18 @@ protected void onNonSymbolMatched(char c, int start) { * for a subscribed listener. 
* @param str the text string * @param start the text start position + * @return {@code true} if one or more complete tokens were published from + * the text */ - protected void onNonSymbolMatched(String str, int start) { + protected boolean onNonSymbolMatched(String str, int start) { NonSymbolMatchedListener l = nonSymbolListener; if (l != null) { TextMatchedEvent evt = new TextMatchedEvent(this, str, start, start + str.length()); l.nonSymbolMatched(evt); + return evt.isPublished(); } + return false; } /** @@ -150,15 +205,19 @@ protected void onNonSymbolMatched(String str, int start) { * @param str the text string * @param hint the text hint * @param start the text start position + * @return {@code true} if one or more complete tokens were published from + * the text */ - protected void onNonSymbolMatched(String str, EmphasisHint hint, - int start) { + protected boolean onNonSymbolMatched(String str, EmphasisHint hint, + int start) { NonSymbolMatchedListener l = nonSymbolListener; if (l != null) { TextMatchedEvent evt = new TextMatchedEvent(this, str, hint, start, start + str.length()); l.nonSymbolMatched(evt); + return evt.isPublished(); } + return false; } /** diff --git a/src/org/opensolaris/opengrok/analysis/JFlexTokenizer.java b/src/org/opensolaris/opengrok/analysis/JFlexTokenizer.java index b6a3f2f03e3..00d48b628e3 100644 --- a/src/org/opensolaris/opengrok/analysis/JFlexTokenizer.java +++ b/src/org/opensolaris/opengrok/analysis/JFlexTokenizer.java @@ -24,24 +24,187 @@ package org.opensolaris.opengrok.analysis; import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Deque; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Locale; +import java.util.Set; +import java.util.function.Supplier; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.lucene.analysis.Tokenizer; import 
org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.opensolaris.opengrok.analysis.plain.PlainFullTokenizer; /** + * Represents a {@link Tokenizer} subclass that listens to OpenGrok language + * lexers to produce token streams for indexing. + *

* Created on August 24, 2009 - * * @author Lubos Kosco */ -public class JFlexTokenizer extends Tokenizer - implements SymbolMatchedListener { +public class JFlexTokenizer extends Tokenizer implements SymbolMatchedListener, + NonSymbolMatchedListener { + + /** + * To avoid over-indexing on pathologically long non-whitespace strings, + * limit the number of sub-strings that will be indexed (for every + * contiguous segment of non-whitespace). + */ + private static final int MAX_NONWHITESPACE_SUBSTRINGS = 64; + + /** + * To allow for some discarded tokens on pathologically long non-whitespace + * strings (e.g. if length is longer than {@link #MAX_TOKEN_CHARS}), limit + * the attempts to produce sub-strings (for every contiguous segment of + * non-whitespace) as some number higher than + * {@link #MAX_NONWHITESPACE_SUBSTRINGS}. + */ + private static final int MAX_NONWHITESPACE_SUBSTRING_TRIES = + MAX_NONWHITESPACE_SUBSTRINGS + 10; + + /** + * Defines a limit of token string size to avoid indexing pathologically + * long captures. (This is number of characters, not UTF-8 bytes, so a + * power-of-2 has no benefit). + */ + private static final int MAX_TOKEN_CHARS = 1000; + + /** + * Matches a sub-string that starts at: 1) a word character following a word + * boundary; or 2) that starts at a character _c_ that is not a quote, + * nor apostrophe, nor Unicode "Punctuation, Close" where _c_ follows a + * quote or apostrophe; or 3) that starts at a character _d_ following a + * "Punctuation, Open" character where _d_ is not "Punctuation, Open" nor + * "Punctuation, Close" -- and includes all remaining characters: + *

+     * {@code
+     * (?Ux) (?:\b\w |  #1
+     *     [^"'\p{gc=Pe}](?<=["'].) |  #2
+     *     [^\p{gc=Ps}\p{gc=Pe}](?<=\p{gc=Ps}. )  #3
+     *     ).*
+     * }
+     * 
+ * (Edit above and paste below [in NetBeans] for easy String escaping.) + */ + private static final Pattern WORDPLUS = Pattern.compile( + "(?Ux) (?:\\b\\w |" + + " [^\"'\\p{gc=Pe}](?<=[\"'].) |" + + " [^\\p{gc=Ps}\\p{gc=Pe}](?<=\\p{gc=Ps}.)" + + " ).*"); + + /** + * Matches: 1) a word boundary following a word character or a + * non-word/non-full-stop character; or 2) a quote or apostrophe following a + * non-quote or -apostrophe; or 3) a non-"Punctuation, Close" following a + * "Punctuation, Close"; or 4) a Unicode "Punctuation, Close" character + * following another character (the "following another character" is + * implicit because {@link #PAST_PHRASE} is only matched for offset + * >= 1). + *
+     * {@code
+     * (?Ux) \b(?<=\w|[^\w\.]) |  #1
+     *     ["'](?<=[^"'].) |  #2
+     *     \P{gc=Pe}(?<=\p{gc=Pe}.) |  #3
+     *     \p{gc=Pe}  #4
+     * }
+     * 
+ * (Edit above and paste below [in NetBeans] for easy String escaping.) + */ + private static final Pattern PAST_PHRASE = Pattern.compile( + "(?Ux) \\b(?<=\\w|[^\\w\\.]) |" + + " [\"'](?<=[^\"'].) |" + + " \\P{gc=Pe}(?<=\\p{gc=Pe}.) |" + + " \\p{gc=Pe}"); + + /** + * Matches a string that is all non-word characters. + *
+     * {@code
+     * (?U)^\W*$
+     * }
+     * 
+ * (Edit above and paste below [in NetBeans] for easy String escaping.) + */ + private static final Pattern ALL_NONWORD = Pattern.compile("(?U)^\\W*$"); private final ScanningSymbolMatcher matcher; + private final CharTermAttribute termAtt = addAttribute( + CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute( + OffsetAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute( + PositionIncrementAttribute.class); + + /** + * Defines the ultimate queue of tokens to be produced by + * {@link #incrementToken()}. + */ + private final Deque events = new LinkedList<>(); + + /** + * Tracks unique pending tokens in {@link #events} (which is not + * necessarily unique across the document but close enough for this + * class's purposes). + */ + private final Set eventsSet = new HashSet<>(); + + /** + * When {@link TokenizerMode} allows overlapping tokens, in order to avoid a + * Lucene {@link IllegalArgumentException} related to mis-ordered offsets + * ("... offsets must not go backwards"), tokens are accumulated in the + * following list until an indication that local overlapping is detected. + * Then the following list will be sorted and its tokens queued to + * {@link events}. + */ + private final List eventHopper = new ArrayList<>(); + + private final List snapshotEvents = new ArrayList<>(); + + /** + * Tracks unique symbol tokens -- until the next {@link #reset()}. + */ + private final Set symbolsSet = new HashSet<>(); + + /** + * Tracks a transient list of sub-strings of a string, where the sub-strings + * start at different left positions and use the entire rest of the + * original string. E.g., {@code "func(a,b)"} -> + * {@code ["func(a,b)", "a,b)", "b)"]}. 
+ */ + private final List lsubs = new ArrayList<>(); + + private final StringBuilder nonWhitespaceBuilder = new StringBuilder(); + + private int nonWhitespaceOff = -1; + + private TokenizerMode mode = TokenizerMode.SYMBOLS_ONLY; + + private Supplier modeGetter; + + private PendingToken lastPublished; + + /** initialized lazily as needed */ + private PlainFullTokenizer plainTokenizer; + private int plainTokenizerOffset; + + private boolean snapshotting; + private int snapshotEventCount; + private int snapshotEventHopperCount; + + private boolean caseInsensitive; + /** * Initialize an instance, passing a {@link ScanningSymbolMatcher} which - * will be owned by the {@link JFlexTokenizer}. + * will be owned by the {@link JFlexTokenizer}. The tokenizer's + * {@link #isCaseInsensitive()} will be set to + * {@link ScanningSymbolMatcher#isDefaultCaseInsensitive()}. * @param matcher a defined instance */ public JFlexTokenizer(ScanningSymbolMatcher matcher) { @@ -50,11 +213,78 @@ public JFlexTokenizer(ScanningSymbolMatcher matcher) { } this.matcher = matcher; matcher.setSymbolMatchedListener(this); + matcher.setNonSymbolMatchedListener(this); // The tokenizer will own the matcher, so we won't have to unsubscribe. + + caseInsensitive = matcher.isDefaultCaseInsensitive(); + } + + /** + * Gets a value indicating how the tokenizer tracks symbols and -- if the + * {@link ScanningSymbolMatcher} supports it -- contiguous, non-whitespace + * sub-strings. Default is {@link TokenizerMode#SYMBOLS_ONLY}. + */ + public TokenizerMode getTokenizerMode() { + return mode; + } + + /** + * Sets a value indicating how the tokenizer tracks symbols and -- if the + * {@link ScanningSymbolMatcher} supports it -- contiguous, non-whitespace + * sub-strings. 
+ */ + public void setTokenizerMode(TokenizerMode value) { + if (value != this.mode) { + this.mode = value; + if (value == TokenizerMode.SYMBOLS_ONLY) { + nonWhitespaceBuilder.setLength(0); + nonWhitespaceOff = -1; + } + matcher.setTokenizerMode(value); + } + } + + /** + * Sets an object for deferring the setting of + * {@link #setTokenizerMode(org.opensolaris.opengrok.analysis.TokenizerMode)} + * during {@link #reset()} -- e.g. when the {@link JFlexTokenizer} is + * passed off to a higher-level Lucene object. + * @param getter a defined instance or {@code null} to set + * {@link #getTokenizerMode()} to its default value + */ + public void setTokenizerModeSupplier(Supplier getter) { + this.modeGetter = getter; + if (getter != null) { + setTokenizerMode(getter.get()); + } else { + setTokenizerMode(TokenizerMode.SYMBOLS_ONLY); + } + } + + /** + * Gets a value indicating if published tokens are lower-cased so a token + * stream will be case-insensitive. Default is {@code false}. + */ + public boolean isCaseInsensitive() { + return caseInsensitive; + } + + /** + * Sets a value indicating if published tokens are lower-cased so a token + * stream will be case-insensitive. + */ + public void setCaseInsensitive(boolean value) { + caseInsensitive = value; } /** * Resets the instance and the instance's {@link ScanningSymbolMatcher}. + *

+ * N.b. {@link #getTokenizerMode()} is not affected unless + * {@link #setTokenizerModeSupplier(java.util.function.Supplier)} was called + * with a defined instance, and {@link #isCaseInsensitive()} is not + * affected. + *

* If necessary, users should have first called this instance's * {@link #setReader(java.io.Reader)} since the matcher will be * reset to the current reader. @@ -63,9 +293,27 @@ public JFlexTokenizer(ScanningSymbolMatcher matcher) { @Override public void reset() throws IOException { super.reset(); + + clearAttributes(); + eventHopper.clear(); + events.clear(); + eventsSet.clear(); + lastPublished = null; + // `lsubs' is managed exclusively by addNonWhitespaceSubstrings(). + // `mode' is (possibly) managed below. + nonWhitespaceBuilder.setLength(0); + nonWhitespaceOff = -1; + symbolsSet.clear(); + + snapshotStop(); + + Supplier getter = modeGetter; + if (getter != null) { + setTokenizerMode(getter.get()); + } + matcher.yyreset(input); matcher.reset(); - clearAttributes(); } /** @@ -78,34 +326,55 @@ public final void close() throws IOException { matcher.yyclose(); } - private final CharTermAttribute termAtt = addAttribute( - CharTermAttribute.class); - private final OffsetAttribute offsetAtt = addAttribute( - OffsetAttribute.class); - private final PositionIncrementAttribute posIncrAtt = addAttribute( - PositionIncrementAttribute.class); - /** - * This will re-initialize internal AttributeImpls, or it returns false if - * end of input Reader ... - * + * Executes {@link ScanningSymbolMatcher#yylex()} until either a token is + * produced or the EOF is reached; and calls + * {@link #setAttribs(org.opensolaris.opengrok.analysis.JFlexTokenizer.PendingToken)} + * upon the former. 
* @return false if no more tokens, otherwise true * @throws IOException in case of I/O error */ @Override public final boolean incrementToken() throws IOException { + while (events.isEmpty() && notEOF()) { + // just iterating + } + + if (!events.isEmpty()) { + PendingToken tok = events.remove(); + eventsSet.remove(tok); + setAttribs(tok); + lastPublished = tok; + return true; + } + clearAttributes(); - return matcher.yylex() != matcher.getYYEOF(); + lastPublished = null; + return false; } /** - * Calls {@link #setAttribs(java.lang.String, int, int)} on the publishing - * of a {@link SymbolMatchedEvent}. + * Enqueues a token on the publishing of a {@link SymbolMatchedEvent}, and + * does additional non-whitespace sub-string handling if + * {@link #getTokenizerMode()} is eligible. * @param evt the event raised */ @Override public void symbolMatched(SymbolMatchedEvent evt) { - setAttribs(evt.getStr(), evt.getStart(), evt.getEnd()); + switch (mode) { + case SYMBOLS_ONLY: + case SYMBOLS_AND_NON_WHITESPACE: + PendingToken tok = new PendingToken(evt.getStr(), + evt.getStart(), evt.getEnd()); + if (addEventToken(tok)) { + symbolsSet.add(tok); + } + break; + default: + break; + } + + onTextMatched(evt.getSource(), evt.getStr(), evt.getStart()); } /** @@ -117,19 +386,571 @@ public void sourceCodeSeen(SourceCodeSeenEvent evt) { } /** - * Clears, and then resets the instances attributes per the specified - * arguments. - * @param str the matched symbol - * @param start the match start position - * @param end the match end position + * Does non-whitespace sub-string handling if {@link #getTokenizerMode()} + * is eligible. + * @param evt the event raised + */ + @Override + public void nonSymbolMatched(TextMatchedEvent evt) { + boolean isPublished = onTextMatched(evt.getSource(), evt.getStr(), + evt.getStart()); + evt.setPublished(isPublished); + } + + /** + * Does non-whitespace sub-string handling if {@link #getTokenizerMode()} + * is eligible. 
+ * @param evt the event raised + */ + @Override + public void keywordMatched(TextMatchedEvent evt) { + onTextMatched(evt.getSource(), evt.getStr(), evt.getStart()); + } + + /** + * Does non-whitespace sub-string handling if {@link #getTokenizerMode()} + * is eligible. + * @param evt the event raised + */ + @Override + public void endOfLineMatched(TextMatchedEvent evt) { + onTextMatched(evt.getSource(), evt.getStr(), evt.getStart()); + } + + /** + * Does nothing. + * @param evt ignored */ - protected void setAttribs(String str, int start, int end) { + @Override + public void disjointSpanChanged(DisjointSpanChangedEvent evt) { + } + + /** + * Does non-whitespace sub-string handling if {@link #getTokenizerMode()} + * is eligible. + * @param evt the event raised + */ + @Override + public void linkageMatched(LinkageMatchedEvent evt) { + onTextMatched(evt.getSource(), evt.getStr(), evt.getStart()); + } + + /** + * Does non-whitespace sub-string handling if {@link #getTokenizerMode()} + * is eligible. + * @param evt the event raised + */ + @Override + public void pathlikeMatched(PathlikeMatchedEvent evt) { + onTextMatched(evt.getSource(), evt.getStr(), evt.getStart()); + } + + /** + * Does non-whitespace sub-string handling if {@link #getTokenizerMode()} + * is eligible. + * @param evt the event raised + */ + @Override + public void scopeChanged(ScopeChangedEvent evt) { + onTextMatched(evt.getSource(), evt.getStr(), evt.getStart()); + } + + /** + * Clears, and then resets the instance's attributes per the specified + * argument. + * @param tok the matched token + */ + protected void setAttribs(PendingToken tok) { clearAttributes(); - //FIXME increasing below by one(default) might be tricky, need more analysis - // after lucene upgrade to 3.5 below is most probably not even needed - this.posIncrAtt.setPositionIncrement(1); + + this.posIncrAtt.setPositionIncrement(tok.nonpos ? 
0 : 1); this.termAtt.setEmpty(); - this.termAtt.append(str); - this.offsetAtt.setOffset(start, end); + this.termAtt.append(tok.str); + this.offsetAtt.setOffset(tok.start, tok.end); + } + + /** + * If {@link #getTokenizerMode()} is eligible, then does handling w.r.t. + * {@link #addNonWhitespace()}. + * @return {@code true} if one or more complete tokens were published from + * the text + */ + private boolean onTextMatched(Object source, String str, int start) { + boolean ret = false; + + switch (mode) { + case NON_WHITESPACE_ONLY: + case SYMBOLS_AND_NON_WHITESPACE: + for (int i = 0; i < str.length(); ++i) { + char c = str.charAt(i); + if (Character.isWhitespace(c)) { + if (nonWhitespaceOff >= 0) { + addNonWhitespace(); + } + /** + * In OpenGrok, a symbol will never begin with a + * whitespace character, so whenever this method sees a + * text whitespace, that means any possible local + * overlapping of SYMBOLS_AND_NON_WHITESPACE is over. + * eventHopper is sorted and its contents published to + * events. + */ + if (emptyHopperToQueue()) { + ret = true; + } + } else if (nonWhitespaceOff < 0) { + nonWhitespaceOff = start + i; + nonWhitespaceBuilder.append(c); + } else { + nonWhitespaceBuilder.append(c); + } + } + break; + default: + break; + } + + return ret; + } + + /** + * Executes the {@link ScanningSymbolMatcher#yylex()}, and tests whether it + * returned {@link ScanningSymbolMatcher#getYYEOF()} -- if so, then + * any necessary, finishing operations are executed. + * @return {@code true} if {@code YYEOF} was not returned + */ + private boolean notEOF() throws IOException { + boolean isEOF = matcher.yylex() == matcher.getYYEOF(); + if (isEOF) { + if (nonWhitespaceOff >= 0) { + addNonWhitespace(); + } + emptyHopperToQueue(); + } + return !isEOF; + } + + /** + * If non-whitespace has been collected according to + * {@link #getTokenizerMode()}, then queue at least one + * {@link PendingToken} and possibly more according to + * {@link #getTokenizerMode()}. 
+ * @return {@code true} if one or more complete tokens were published from + * the text + */ + private boolean addNonWhitespace() { + boolean ret = false; + + if (nonWhitespaceBuilder.length() > 0) { + String nonwhsp = nonWhitespaceBuilder.toString(); + nonWhitespaceBuilder.setLength(0); + + boolean abortedNonWhitespaceSubstrings = false; + /* + * In the most expansive mode, additional sub-strings within + * non-whitespace matches are also tokenized. + */ + if (mode == TokenizerMode.SYMBOLS_AND_NON_WHITESPACE) { + if (!addNonWhitespaceSubstrings(nonwhsp)) { + abortedNonWhitespaceSubstrings = true; + } + } + + if (!abortedNonWhitespaceSubstrings) { + PendingToken tok = new PendingToken(nonwhsp, nonWhitespaceOff, + nonWhitespaceOff + nonwhsp.length()); + ret = addEventToken(tok); + } + } + nonWhitespaceOff = -1; + return ret; + } + + /** + * Queues additional, word-boundary sub-string tokens found within + * {@code fullsub}. + */ + private boolean addNonWhitespaceSubstrings(String fullsub) { + if (fullsub.length() < 1) { + return false; + } + + snapshotStart(); + try { + return addNonWhitespaceSubstrings1(fullsub); + } finally { + snapshotStop(); + } + } + + /** + * Subordinate of {@link #addNonWhitespaceSubstrings(java.lang.String)}. + */ + private boolean addNonWhitespaceSubstrings1(String fullsub) { + lsubs.clear(); + /* + * Track a (not-published-here) entry for `fullsub' to be used for later + * iterations. + */ + lsubs.add(new PendingSub(fullsub, nonWhitespaceOff)); + + int successes = 0; + int tries = 0; + + /* + * Add any sub-strings of `fullsub' starting at word(plus)-boundaries + * except for a sub-string that is entirely `fullsub'. Avoid splitting + * any known language contractions by tracking `xlen0'. 
+ */ + int xlen0 = matcher.getLongestContractionPrefix(fullsub); + int moff = 0; + Matcher lsubMatcher = WORDPLUS.matcher(fullsub); + while (lsubMatcher.find(moff)) { + String lsub = lsubMatcher.group(); + int loff = nonWhitespaceOff + lsubMatcher.start(); + if (loff > nonWhitespaceOff && loff >= nonWhitespaceOff + xlen0) { + // Extend the contraction-protection region if necessary. + xlen0 = lsubMatcher.start() + + matcher.getLongestContractionPrefix(lsub); + + PendingToken tok = new PendingToken(lsub, loff, loff + + lsub.length()); + lsubs.add(new PendingSub(lsub, loff)); + if (addEventToken(tok) && ++successes >= + MAX_NONWHITESPACE_SUBSTRINGS) { + snapshotRollback(); + addPlainTokens(fullsub, nonWhitespaceOff); + return false; + } + if (++tries >= MAX_NONWHITESPACE_SUBSTRING_TRIES) { + snapshotRollback(); + addPlainTokens(fullsub, nonWhitespaceOff); + return false; + } + } + moff = lsubMatcher.start() + 1; + } + + // Initialize PAST_PHRASE matchers in lsubs. + for (PendingSub psub : lsubs) { + psub.ender = PAST_PHRASE.matcher(psub.str); + psub.roff = 1; // Start looking past the 0th character. + psub.xlen = matcher.getLongestContractionPrefix(psub.str); + } + + /* + * Add any sub-strings of PendingSub-strings ending at phrase + * boundaries beyond the 0th character, short of the full length of the + * PendingSub-string, and beyond any known language contractions per + * `psub.xlen'. + * + * Because the number of sub-strings produced by this method is limited + * to MAX_NONWHITESPACE_SUBSTRINGS, iterate in a circular fashion + * through `lsubs' so that sub-strings (if limited) are spread evenly + * among `lsubs'. I.e., so that sub-strings aren't exhausted from the + * first entry of `lsubs' while the last one gets none. 
+ */ + int pidx = -1; + while ((pidx = circlePendingSubs(pidx)) >= 0) { + PendingSub psub = lsubs.get(pidx); + boolean didAddToken = false; + while (psub.ender.find(psub.roff) && psub.ender.start() < + psub.str.length()) { + int pends = psub.ender.start(); + psub.roff = pends + 1; + if (pends >= psub.xlen) { + String lrsub = psub.str.substring(0, pends); + if (!isLoneNonWordlikeChar(lrsub)) { + PendingToken tok = new PendingToken(lrsub, psub.start, + psub.start + lrsub.length()); + if (addEventToken(tok)) { + if (++successes >= MAX_NONWHITESPACE_SUBSTRINGS) { + snapshotRollback(); + addPlainTokens(fullsub, nonWhitespaceOff); + return false; + } + // After one added token, break to circle in lsubs. + didAddToken = true; + break; + } + if (++tries >= MAX_NONWHITESPACE_SUBSTRING_TRIES) { + snapshotRollback(); + addPlainTokens(fullsub, nonWhitespaceOff); + return false; + } + } + } + } + if (!didAddToken) { + /** + * If no new token was got from `pidx', remove the entry from + * `lsubs' so the candidates shrink; and then move `pidx' back + * circularly to the previous entry. + */ + lsubs.remove(pidx); + pidx = -1 + (pidx > 0 ? pidx : lsubs.size()); + } + } + + return true; + } + + /** + * Queues the specified {@code tok} if: 1) its size is eligible; 2) + * {@code tok} is not equal to the last published token; and 3) {@code tok} + * is not present in the tracked set of symbol tokens nor in the tracked set + * of pending tokens. 
+ */ + private boolean addEventToken(PendingToken tok) { + if (tok.str.length() > 0 && tok.str.length() <= MAX_TOKEN_CHARS) { + if (caseInsensitive) { + tok = new PendingToken(tok.str.toLowerCase(Locale.ROOT), + tok.start, tok.end); + } + + if ((lastPublished == null || !lastPublished.equals(tok)) && + !symbolsSet.contains(tok) && eventsSet.add(tok)) { + if (snapshotting) { + snapshotEvents.add(tok); + } + + switch (mode) { + case SYMBOLS_AND_NON_WHITESPACE: + /** + * In OpenGrok, a symbol will never begin with a + * whitespace character, so whenever this method sees a + * token starting with whitespace, that means any + * possible local overlapping of + * SYMBOLS_AND_NON_WHITESPACE is over. eventHopper is + * sorted and its contents published to events. + */ + if (Character.isWhitespace(tok.str.charAt(0))) { + emptyHopperToQueue(); + events.add(tok); + if (snapshotting) { + ++snapshotEventCount; + } + return true; + } else { + eventHopper.add(tok); + if (snapshotting) { + ++snapshotEventHopperCount; + } + return true; + } + default: + events.add(tok); + if (snapshotting) { + ++snapshotEventCount; + } + return true; + } + } + } + + return false; + } + + /** + * Moves all elements from {@link #eventHopper} to {@link #events} after + * first ordering the former and determining position increment values for + * the ordered elements relative to their predecessors in the hopper. 
+ */ + private boolean emptyHopperToQueue() { + if (eventHopper.size() < 1) { + return false; + } + + if (eventHopper.size() == 1) { + events.add(eventHopper.get(0)); + eventHopper.clear(); + if (snapshotting) { + ++snapshotEventCount; + --snapshotEventHopperCount; + } + return true; + } + + eventHopper.sort(PendingTokenOffsetsComparator.INSTANCE); + int lastNewStart = -1; + int lastNewPos = -1; + String presentWord = ""; + String lastWord = ""; + for (PendingToken ntok : eventHopper) { + events.add(ntok); + if (snapshotting) { + snapshotEvents.add(ntok); + } + + // When PendingToken `start' changes, begin a new `presentWord'. + if (ntok.start != lastNewStart) { + lastNewStart = ntok.start; + presentWord = ntok.str; + } else if (ntok.str.length() > presentWord.length()) { + // Extend `presentWord' to a longer value. + presentWord = ntok.str; + } + + /** + * Track `lastNewPos' to indicate the start position of last token + * with a non-zero position increment. After `lastNewPos' is first + * defined, the current token may have a position increment of zero + * or non-zero depending on the presence (or not) of only non-word + * characters between the current token and `lastNewPos'. + * + * This is meant to allow phrase comparisons when minor interleaving + * punctuation is present; e.g. so that a match can occur for + * "contains some strange" against the source text: + * "contains some 'strange' characters". + */ + + if (lastNewPos < 0) { + lastNewPos = ntok.start; + lastWord = presentWord; + } else if (ntok.start == lastNewPos) { + ntok.nonpos = true; + lastWord = presentWord; + } else { + /** + * With overlapping tokens, if the sub-string preceding `ntok' + * is all non-word characters, then also set `nonpos' to true. 
+ */ + if (lastWord.length() >= ntok.end - lastNewPos) { + String lastLede = lastWord.substring(0, ntok.start - + lastNewPos); + if (allNonWord(lastLede)) { + ntok.nonpos = true; + } else { + lastNewPos = ntok.start; + lastWord = presentWord; + } + } else { + lastNewPos = ntok.start; + lastWord = presentWord; + } + } + } + if (snapshotting) { + snapshotEventCount += eventHopper.size(); + snapshotEventHopperCount -= eventHopper.size(); + } + eventHopper.clear(); + return true; + } + + /** + * Determines if {@code pword} consists of all non-word characters (or + * likewise is empty). + */ + private boolean allNonWord(String pword) { + return ALL_NONWORD.matcher(pword).matches(); + } + + /** + * Determines if {@code str} is a single character that is not a letter, + * digit, or underscore. + */ + private boolean isLoneNonWordlikeChar(String str) { + if (str.length() != 1) { + return false; + } + char c = str.charAt(0); + return c != '_' && !Character.isLetterOrDigit(c); + } + + /** + * Iterates through {@link #lsubs} in a circular fashion. + */ + private int circlePendingSubs(int currentIdx) { + int csize = lsubs.size(); + if (currentIdx == -1 || ++currentIdx >= csize) { + return csize > 0 ? 0 : -1; + } + return currentIdx; + } + + /** + * Fallback to PlainFullTokenizer after + * {@link #addNonWhitespaceSubstrings(java.lang.String)} hit limit of + * {@link #MAX_NONWHITESPACE_SUBSTRINGS} or + * {@link #MAX_NONWHITESPACE_SUBSTRING_TRIES}. 
+ */ + private void addPlainTokens(String fullsub, int offset) { + plainTokenizerOffset = offset; + if (plainTokenizer == null) { + plainTokenizer = new PlainFullTokenizer(FileAnalyzer.dummyReader); + plainTokenizer.setSymbolMatchedListener( + new SymbolMatchedListener() { + @Override + public void symbolMatched(SymbolMatchedEvent evt) { + addEventToken(new PendingToken(evt.getStr(), + plainTokenizerOffset + evt.getStart(), + plainTokenizerOffset + evt.getEnd())); + } + + @Override + public void sourceCodeSeen(SourceCodeSeenEvent evt) {} + }); + } + + try (StringReader rdr = new StringReader(fullsub)) { + plainTokenizer.yyreset(rdr); + try { + while (plainTokenizer.yylex() != plainTokenizer.getYYEOF()) {} + } catch (IOException ex) { + // IOException not expected with StringReader + } + } + } + + /** + * Unwinds events which were accounted for since the last, implicit call to + * {@link #snapshotInit()} (or since the instance was constructed); and + * then call {@link #snapshotStop()}. + */ + private void snapshotRollback() { + while (snapshotEventCount-- > 0) { + events.removeLast(); + } + while (snapshotEventHopperCount-- > 0) { + eventHopper.remove(eventHopper.size() - 1); + } + for (PendingToken evt : snapshotEvents) { + eventsSet.remove(evt); + } + snapshotStop(); + } + + /** + * Called implicitly via {@link #snapshotStart()} and + * {@link #snapshotStop()} + */ + private void snapshotInit() { + snapshotEvents.clear(); + snapshotEventCount = 0; + snapshotEventHopperCount = 0; + } + + private void snapshotStart() { + snapshotInit(); + snapshotting = true; + } + + private void snapshotStop() { + snapshotInit(); + snapshotting = false; + } + + private static class PendingSub { + public final String str; + public final int start; + public Matcher ender; + public int roff; + public int xlen; + + public PendingSub(String str, int start) { + this.str = str; + this.start = start; + } } } diff --git a/src/org/opensolaris/opengrok/analysis/ScanningSymbolMatcher.java 
b/src/org/opensolaris/opengrok/analysis/ScanningSymbolMatcher.java index 3154324ec66..951966d0a90 100644 --- a/src/org/opensolaris/opengrok/analysis/ScanningSymbolMatcher.java +++ b/src/org/opensolaris/opengrok/analysis/ScanningSymbolMatcher.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2009, 2017, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017, Chris Fraire . + * Portions Copyright (c) 2017-2018, Chris Fraire . */ package org.opensolaris.opengrok.analysis; @@ -30,4 +30,26 @@ */ public interface ScanningSymbolMatcher extends JFlexStackingLexer, Resettable, SymbolMatchedPublisher { + + /** + * Gets a value indicating if the matcher is by-default case-insensitive -- + * i.e. whether tokens should be lower-cased when published in a stream. + */ + boolean isDefaultCaseInsensitive(); + + /** + * Implementers can override if necessary to alter their behavior for + * different modes. + */ + void setTokenizerMode(TokenizerMode value); + + /** + * Determines if {@code str} starts with a contraction (i.e., a word + * containing letters and non-word characters such as "ain't") according to + * the specific language. + * @param str a defined instance + * @return 0 if {@code str} does not start with a contraction; or else the + * length of the longest initial contraction + */ + int getLongestContractionPrefix(String str); } diff --git a/src/org/opensolaris/opengrok/analysis/TagDesc.java b/src/org/opensolaris/opengrok/analysis/TagDesc.java new file mode 100644 index 00000000000..ad4217f47ac --- /dev/null +++ b/src/org/opensolaris/opengrok/analysis/TagDesc.java @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018, Chris Fraire . + */ + +package org.opensolaris.opengrok.analysis; + +/** + * Represents an immutable tuple of string-converted {@link Definitions.Tag} + * data for source-context presentations. + */ +public class TagDesc { + + public final String symbol; + public final String lineno; + public final String type; + public final String text; + public final String scope; + + public TagDesc(String symbol, String lineno, String type, String text, + String scope) { + this.symbol = symbol; + this.lineno = lineno; + this.type = type; + this.text = text; + this.scope = scope; + } +} diff --git a/src/org/opensolaris/opengrok/analysis/TextMatchedEvent.java b/src/org/opensolaris/opengrok/analysis/TextMatchedEvent.java index c3498b0a36f..f1a87596067 100644 --- a/src/org/opensolaris/opengrok/analysis/TextMatchedEvent.java +++ b/src/org/opensolaris/opengrok/analysis/TextMatchedEvent.java @@ -18,7 +18,7 @@ */ /* - * Copyright (c) 2017, Chris Fraire . + * Copyright (c) 2017-2018, Chris Fraire . */ package org.opensolaris.opengrok.analysis; @@ -34,9 +34,11 @@ public class TextMatchedEvent { private final EmphasisHint hint; private final int start; private final int end; + private boolean published; /** - * Initializes an immutable instance of {@link TextMatchedEvent}. + * Initializes an almost wholly immutable instance of + * {@link TextMatchedEvent} with {@link #isPublished} set to {@code false}. 
* @param source the event source * @param str the text string * @param start the text start position @@ -102,4 +104,20 @@ public int getEnd() { public EmphasisHint getHint() { return hint; } + + /** + * Gets a value indicating if one or more tokens were published from the + * text. + */ + public boolean isPublished() { + return published; + } + + /** + * Sets a value indicating if one or more tokens were published from the + * text. + */ + public void setPublished(boolean value) { + published = value; + } } diff --git a/src/org/opensolaris/opengrok/analysis/TokenizerMode.java b/src/org/opensolaris/opengrok/analysis/TokenizerMode.java new file mode 100644 index 00000000000..276e698dd58 --- /dev/null +++ b/src/org/opensolaris/opengrok/analysis/TokenizerMode.java @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018, Chris Fraire . + */ + +package org.opensolaris.opengrok.analysis; + +/** + * Represents an enumeration of token-production modes. + */ +public enum TokenizerMode { + /** + * Only produces tokens raised via {@link SymbolMatchedEvent}. + */ + SYMBOLS_ONLY, + /** + * Only produces tokens resulting from analysis of contiguous, disjoint + * non-whitespace. 
+ */ + NON_WHITESPACE_ONLY, + /** + * Produces tokens raised by {@link SymbolMatchedEvent} -- as well as those + * resulting from analysis of contiguous, disjoint non-whitespace plus + * word-boundary and opening- and closing-punctuation boundary sub-strings + * found therein. + */ + SYMBOLS_AND_NON_WHITESPACE +} diff --git a/src/org/opensolaris/opengrok/analysis/plain/PlainFullTokenizer.lex b/src/org/opensolaris/opengrok/analysis/plain/PlainFullTokenizer.lex index 29c31a9bfa8..256a9167516 100644 --- a/src/org/opensolaris/opengrok/analysis/plain/PlainFullTokenizer.lex +++ b/src/org/opensolaris/opengrok/analysis/plain/PlainFullTokenizer.lex @@ -19,13 +19,16 @@ /* * Copyright (c) 2005, 2017, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017, Chris Fraire . + * Portions Copyright (c) 2017-2018, Chris Fraire . */ package org.opensolaris.opengrok.analysis.plain; import java.util.Locale; +import java.util.regex.Pattern; import org.opensolaris.opengrok.analysis.JFlexSymbolMatcher; +import org.opensolaris.opengrok.analysis.TokenizerMode; +import org.opensolaris.opengrok.util.TextTrieMap; %% %public %class PlainFullTokenizer @@ -39,15 +42,115 @@ import org.opensolaris.opengrok.analysis.JFlexSymbolMatcher; %include CommonLexer.lexh %caseless %char +%{ + private static final Pattern POSSESSIVE = Pattern.compile("(?U)^\\w+'s$"); + + private static final TextTrieMap CONTRACTIONS; + + private final int[] contractionLength = new int[1]; + + private TokenizerMode mode = TokenizerMode.SYMBOLS_ONLY; + + static { + CONTRACTIONS = new TextTrieMap<>(); + for (String word : new String[] { + "ain't", "amn't", "aren't", "can't", "cain't", "could've", + "couldn't", "daren't", "daresn't", "dasn't", "didn't", + "doesn't", "don't", "e'er", "gonna", "gotta", "hadn't", + "hasn't", "haven't", "he'd", "he'll", "he's", "how'd", + "how'll", "how's", "i'd", "i'll", "i'm", "i'm'a", "i've", + "isn't", "it'd", "it'll", "it's", "let's", "ma'am", + "mayn't", "may've", 
"mightn't", "might've", "mustn't", + "must've", "needn't", "ne'er", "o'clock", "o'er", "ol'", + "oughtn't", "shan't", "she'd", "she'll", "she's", + "should've", "shouldn't", "somebody's", "someone's", + "something's", "that'll", "that're", "that's", "that'd", + "there'd", "there're", "there's", "these're", "they'd", + "they'll", "they're", "they've", "this's", "those're", + "wasn't", "we'd", "we'd've", "we'll", "we're", "we've", + "weren't", "what'd", "what'll", "what're", "what's", + "what've", "when's", "where'd", "where're", "where's", + "where've", "which's", "who'd", "who'd've", "who'll", + "who're", "who's", "who've", "why'd", "why're", "why's", + "won't", "would've", "wouldn't", "y'all", "you'd", "you'll", + "you're", "you've" }) { + CONTRACTIONS.put(word, word /* value is irrelevant */); + } + } + + /** + * Gets a value indicating if the matcher is by-default case-insensitive -- + * i.e. whether tokens should be lower-cased when published in a stream. + * @return {@code true} + */ + @Override + public boolean isDefaultCaseInsensitive() { + return true; + } + + /** + * {@link PlainFullTokenizer} alters its behavior for modes which track all + * non-whitespace so that its older parsing does not hurt newer support for + * more comprehensive non-whitespace breaking nor support for plain-text + * (English currently) contractions. + *

+ * The older symbol tokenization splits contractions such as "there's" into + * two tokens which can impact query-ability. + */ + @Override + public void setTokenizerMode(TokenizerMode value) { + mode = value; + } + + /** + * Determines if {@code str} starts with a known contraction from a limited + * collection of English words or is like a singular English possessive + * ending in "'s". + * @return 0 if {@code str} does not start with a contraction; or else the + * length of the longest initial contraction + */ + @Override + public int getLongestContractionPrefix(String str) { + String strlc = str.toLowerCase(Locale.ENGLISH); + if (CONTRACTIONS.get(strlc, 0, contractionLength) != null) { + return contractionLength[0]; + } + return POSSESSIVE.matcher(strlc).matches() ? str.length() : 0; + } +%} //WhiteSpace = [ \t\f\r]+|\n Identifier = [a-zA-Z\p{Letter}_] [a-zA-Z\p{Letter}0-9\p{Number}_]* Number = [0-9]+|[0-9]+\.[0-9]+| "0[xX]" [0-9a-fA-F]+ +// No letters in the following, so no toLowerCase() needed in handling.
Printable = [\@\$\%\^\&\-+=\?\.\:] %% -{Identifier}|{Number}|{Printable} { // below assumes locale from the shell/container, instead of just US - onSymbolMatched(yytext().toLowerCase(Locale.getDefault()), yychar); - return yystate(); +{Identifier}|{Number}|{Printable} { + String capture = yytext(); + switch (mode) { + case SYMBOLS_AND_NON_WHITESPACE: + case NON_WHITESPACE_ONLY: + if (onNonSymbolMatched(capture, yychar)) { + return yystate(); + } + break; + default: + onSymbolMatched(capture, yychar); + return yystate(); + } +} +[^] { + // below assumes locale from the shell/container, instead of just US + switch (mode) { + case SYMBOLS_AND_NON_WHITESPACE: + case NON_WHITESPACE_ONLY: + if (onNonSymbolMatched(yytext(), yychar)) { + return yystate(); + } + break; + default: + // noop + break; + } } -[^] {} diff --git a/src/org/opensolaris/opengrok/analysis/plain/PlainSymbolTokenizer.lex b/src/org/opensolaris/opengrok/analysis/plain/PlainSymbolTokenizer.lex index 2ff7a68f964..41528f79067 100644 --- a/src/org/opensolaris/opengrok/analysis/plain/PlainSymbolTokenizer.lex +++ b/src/org/opensolaris/opengrok/analysis/plain/PlainSymbolTokenizer.lex @@ -19,12 +19,13 @@ /* * Copyright (c) 2005, 2017, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017, Chris Fraire . + * Portions Copyright (c) 2017-2018, Chris Fraire . */ package org.opensolaris.opengrok.analysis.plain; import org.opensolaris.opengrok.analysis.JFlexSymbolMatcher; +import org.opensolaris.opengrok.analysis.TokenizerMode; %% %public %class PlainSymbolTokenizer @@ -37,12 +38,43 @@ import org.opensolaris.opengrok.analysis.JFlexSymbolMatcher; %int %include CommonLexer.lexh %char +%{ + private TokenizerMode mode = TokenizerMode.SYMBOLS_ONLY; + + /** + * {@link PlainSymbolTokenizer} alters its behavior for modes which track + * all non-whitespace so that its older parsing does not hurt newer support + * for more comprehensive non-whitespace breaking. 
+ */ + @Override + public void setTokenizerMode(TokenizerMode value) { + mode = value; + } +%} %% //TODO decide if we should let one char symbols [a-zA-Z_] [a-zA-Z0-9_]+ { - onSymbolMatched(yytext(), yychar); - return yystate(); + String capture = yytext(); + switch (mode) { + case SYMBOLS_AND_NON_WHITESPACE: + case NON_WHITESPACE_ONLY: + onNonSymbolMatched(capture, yychar); + break; + default: + onSymbolMatched(capture, yychar); + return yystate(); + } } -[^] {} +[^] { + switch (mode) { + case SYMBOLS_AND_NON_WHITESPACE: + case NON_WHITESPACE_ONLY: + onNonSymbolMatched(yytext(), yychar); + break; + default: + // noop + break; + } +} diff --git a/src/org/opensolaris/opengrok/analysis/uue/UuencodeFullTokenizer.lex b/src/org/opensolaris/opengrok/analysis/uue/UuencodeFullTokenizer.lex index cd2d3876d92..6228e48860d 100644 --- a/src/org/opensolaris/opengrok/analysis/uue/UuencodeFullTokenizer.lex +++ b/src/org/opensolaris/opengrok/analysis/uue/UuencodeFullTokenizer.lex @@ -20,7 +20,7 @@ /* * Copyright (c) 2005, 2017, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2013 Constantine A. Murenin - * Portions Copyright (c) 2017, Chris Fraire . + * Portions Copyright (c) 2017-2018, Chris Fraire . */ package org.opensolaris.opengrok.analysis.uue; @@ -43,6 +43,16 @@ import org.opensolaris.opengrok.analysis.JFlexSymbolMatcher; boolean modeFound; boolean nameFound; + /** + * Gets a value indicating if the matcher is by-default case-insensitive -- + * i.e. whether tokens should be lower-cased when published in a stream. + * @return {@code true} + */ + @Override + public boolean isDefaultCaseInsensitive() { + return true; + } + /** * Resets the uuencode tracked state after * {@link JFlexSymbolMatcher#reset()}. 
@@ -72,12 +82,12 @@ Printable = [\@\$\%\^\&\-+=\?\.\:] nameFound = false; yybegin(BEGIN); yypushback(1); - onSymbolMatched(yytext().toLowerCase(), yychar); + onSymbolMatched(yytext(), yychar); return yystate(); } {Identifier}|{Number}|{Printable} { - onSymbolMatched(yytext().toLowerCase(), yychar); + onSymbolMatched(yytext(), yychar); return yystate(); } @@ -104,7 +114,7 @@ Printable = [\@\$\%\^\&\-+=\?\.\:] else yybegin(YYINITIAL); yypushback(1); - onSymbolMatched(yytext().toLowerCase(), yychar); + onSymbolMatched(yytext(), yychar); return yystate(); } [^] { yybegin(YYINITIAL); yypushback(1); } @@ -114,7 +124,7 @@ Printable = [\@\$\%\^\&\-+=\?\.\:] " " { if (modeFound) yybegin(NAME); } {Identifier}|{Number}|{Printable} { modeFound = true; - onSymbolMatched(yytext().toLowerCase(), yychar); + onSymbolMatched(yytext(), yychar); return yystate(); } [^] { yybegin(YYINITIAL); yypushback(1); } @@ -129,7 +139,7 @@ Printable = [\@\$\%\^\&\-+=\?\.\:] } {Identifier}|{Number}|{Printable} { nameFound = true; - onSymbolMatched(yytext().toLowerCase(), yychar); + onSymbolMatched(yytext(), yychar); return yystate(); } [^\n] { yybegin(YYINITIAL); yypushback(1); } @@ -141,7 +151,7 @@ Printable = [\@\$\%\^\&\-+=\?\.\:] String t = yytext(); if (t.equals("end") && !b64) { yybegin(YYINITIAL); - onSymbolMatched(yytext().toLowerCase(), yychar); + onSymbolMatched(yytext(), yychar); return yystate(); } else if (t.equals("====") && b64) yybegin(YYINITIAL); diff --git a/src/org/opensolaris/opengrok/configuration/Configuration.java b/src/org/opensolaris/opengrok/configuration/Configuration.java index c8be76452dc..c51d33f5e89 100644 --- a/src/org/opensolaris/opengrok/configuration/Configuration.java +++ b/src/org/opensolaris/opengrok/configuration/Configuration.java @@ -197,6 +197,7 @@ public final class Configuration { private LuceneLockName luceneLocking = LuceneLockName.OFF; private boolean compressXref; private boolean indexVersionedFilesOnly; + private boolean allNonWhitespace; private int 
indexingParallelism; private int historyParallelism; private int historyRenamedParallelism; @@ -420,6 +421,7 @@ public Configuration() { */ // defaults for an opengrok instance configuration cmds = new HashMap<>(); + //allNonWhitespace is default(boolean) setAllowedSymlinks(new HashSet<>()); setAuthorizationWatchdogEnabled(false); //setBugPage("http://bugs.myserver.org/bugdatabase/view_bug.do?bug_id="); @@ -1055,6 +1057,22 @@ public void setIndexVersionedFilesOnly(boolean indexVersionedFilesOnly) { this.indexVersionedFilesOnly = indexVersionedFilesOnly; } + /** + * Gets a value indicating if all non-whitespace should be indexed for + * FULL search. Default is false. + */ + public boolean isAllNonWhitespace() { + return allNonWhitespace; + } + + /** + * Sets a value indicating if all non-whitespace should be indexed for + * FULL search. + */ + public void setAllNonWhitespace(boolean value) { + this.allNonWhitespace = value; + } + public int getIndexingParallelism() { return indexingParallelism; } diff --git a/src/org/opensolaris/opengrok/configuration/RuntimeEnvironment.java b/src/org/opensolaris/opengrok/configuration/RuntimeEnvironment.java index 48c50b19fbe..78c06b37417 100644 --- a/src/org/opensolaris/opengrok/configuration/RuntimeEnvironment.java +++ b/src/org/opensolaris/opengrok/configuration/RuntimeEnvironment.java @@ -155,6 +155,13 @@ public final class RuntimeEnvironment { */ private Short contextSurround; + /** + * Stores a transient value when + * {@link #setAllNonWhitespace(java.lang.Boolean)} is called -- i.e. the + * value is not mediated to {@link Configuration}. 
+ */ + private Boolean allNonWhitespace; + private static final IndexTimestamp indexTime = new IndexTimestamp(); /** @@ -1133,6 +1140,29 @@ public int getHistoryRenamedParallelism() { parallelism; } + /** + * Gets a value indicating if all non-whitespace should be indexed for + * FULL search: either the last value passed to + * {@link #setAllNonWhitespace(java.lang.Boolean)} or + * {@link Configuration#isAllNonWhitespace()} as a default. + */ + public boolean isAllNonWhitespace() { + return allNonWhitespace != null ? allNonWhitespace : + threadConfig.get().isAllNonWhitespace(); + } + + /** + * Sets a value indicating if all non-whitespace should be indexed for + * FULL search, or resets to use {@link Configuration#isAllNonWhitespace()}. + *

+ * N.b. the value is not mediated to {@link Configuration}. + * @param value a defined value or {@code null} to reset to use the + * {@link Configuration#isAllNonWhitespace()} + */ + public void setAllNonWhitespace(Boolean value) { + allNonWhitespace = value; + } + public boolean isTagsEnabled() { return threadConfig.get().isTagsEnabled(); } diff --git a/src/org/opensolaris/opengrok/index/IndexDatabase.java b/src/org/opensolaris/opengrok/index/IndexDatabase.java index a124c51372a..65f96911db5 100644 --- a/src/org/opensolaris/opengrok/index/IndexDatabase.java +++ b/src/org/opensolaris/opengrok/index/IndexDatabase.java @@ -413,7 +413,8 @@ public void update(IndexerParallelizer parallelizer) IOException finishingException = null; try { - Analyzer analyzer = AnalyzerGuru.getAnalyzer(); + FileAnalyzer analyzer = AnalyzerGuru.getAnalyzer(); + analyzer.setAllNonWhitespace(env.isAllNonWhitespace()); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); iwc.setRAMBufferSizeMB(env.getRamBufferSize()); diff --git a/src/org/opensolaris/opengrok/index/Indexer.java b/src/org/opensolaris/opengrok/index/Indexer.java index 74178a049bc..9579615a985 100644 --- a/src/org/opensolaris/opengrok/index/Indexer.java +++ b/src/org/opensolaris/opengrok/index/Indexer.java @@ -419,6 +419,10 @@ public static String[] parseOptions(String[] argv) throws ParseException { helpDetailed = true; }); + parser.on("--allNonWhitespace", "=on|off", ON_OFF, Boolean.class, + "Index all non-whitespace for FULL queries. Default is off."). 
+ Do(v -> cfg.setAllNonWhitespace((Boolean)v)); + parser.on( "-A (.ext|prefix.):(-|analyzer)", "--analyzer", "/(\\.\\w+|\\w+\\.):(-|[a-zA-Z_0-9.]+)/", "Files with the named prefix/extension should be analyzed", diff --git a/src/org/opensolaris/opengrok/search/DefaultQueryEscaper.lex b/src/org/opensolaris/opengrok/search/DefaultQueryEscaper.lex new file mode 100644 index 00000000000..f1f28a5deed --- /dev/null +++ b/src/org/opensolaris/opengrok/search/DefaultQueryEscaper.lex @@ -0,0 +1,58 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018, Chris Fraire . + */ + +package org.opensolaris.opengrok.search; + +%% +%public +%class DefaultQueryEscaper +%extends TermEscaperBase +%unicode +%type boolean +%eofval{ + return false; +%eofval} + +%include QueryEscaper.lexh +%% + +{LuceneSpecialEscape} { + for (int i = 0; i < yylength(); ++i) { + out.append(yycharat(i)); // faster than yytext() + } +} + +/* + * Other fields shouldn't use qualified terms, so escape colons so that we can + * search for them. 
+ */ + +":" { + out.append("\\:"); +} + +[^] { + for (int i = 0; i < yylength(); ++i) { + out.append(yycharat(i)); // faster than yytext() + } +} diff --git a/src/org/opensolaris/opengrok/search/FullQueryEscaper.lex b/src/org/opensolaris/opengrok/search/FullQueryEscaper.lex new file mode 100644 index 00000000000..d7c8c95ebed --- /dev/null +++ b/src/org/opensolaris/opengrok/search/FullQueryEscaper.lex @@ -0,0 +1,58 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018, Chris Fraire . + */ + +package org.opensolaris.opengrok.search; + +%% +%public +%class FullQueryEscaper +%extends TermEscaperBase +%unicode +%type boolean +%eofval{ + return false; +%eofval} + +%include QueryEscaper.lexh +%% + +{LuceneSpecialEscape} { + for (int i = 0; i < yylength(); ++i) { + out.append(yycharat(i)); // faster than yytext() + } +} + +/* + * The free text field may contain terms qualified with other field names, so we + * don't escape single colons. 
+ */ + +"::" { + out.append("\\:\\:"); +} + +[^] { + for (int i = 0; i < yylength(); ++i) { + out.append(yycharat(i)); // faster than yytext() + } +} diff --git a/src/org/opensolaris/opengrok/search/PathQueryEscaper.lex b/src/org/opensolaris/opengrok/search/PathQueryEscaper.lex new file mode 100644 index 00000000000..42f577ef789 --- /dev/null +++ b/src/org/opensolaris/opengrok/search/PathQueryEscaper.lex @@ -0,0 +1,61 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018, Chris Fraire . + */ + +package org.opensolaris.opengrok.search; + +%% +%public +%class PathQueryEscaper +%extends TermEscaperBase +%unicode +%type boolean +%eofval{ + return false; +%eofval} + +%include QueryEscaper.lexh +%% + +{LuceneSpecialEscape} { + for (int i = 0; i < yylength(); ++i) { + out.append(yycharat(i)); // faster than yytext() + } +} + +":" { + out.append("\\:"); +} + +/* + * Workaround for replacing / with escaped / -- needed since lucene 4.x. 
+ */ + +"/" { + out.append("\\/"); +} + +[^] { + for (int i = 0; i < yylength(); ++i) { + out.append(yycharat(i)); // faster than yytext() + } +} diff --git a/src/org/opensolaris/opengrok/search/QueryBuilder.java b/src/org/opensolaris/opengrok/search/QueryBuilder.java index 94508037e9b..8e8802ca974 100644 --- a/src/org/opensolaris/opengrok/search/QueryBuilder.java +++ b/src/org/opensolaris/opengrok/search/QueryBuilder.java @@ -25,6 +25,7 @@ package org.opensolaris.opengrok.search; import java.io.File; +import java.io.StringReader; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; @@ -411,21 +412,26 @@ private String getQueryText(String field) { * @return the escaped query string */ private String escapeQueryString(String field, String query) { + StringReader reader = new StringReader(query); + StringBuilder res = new StringBuilder(); switch (field) { case FULL: - // The free text field may contain terms qualified with other - // field names, so we don't escape single colons. - return query.replace("::", "\\:\\:"); + FullQueryEscaper fesc = new FullQueryEscaper(reader); + fesc.setOut(res); + fesc.consume(); + break; case PATH: - // workaround for replacing / with escaped / - needed since lucene 4.x - if (!(query.startsWith("/") && query.endsWith("/"))) { - return (query.replace(":", "\\:")).replace("/", "\\/"); - } - // Other fields shouldn't use qualified terms, so escape colons - // so that we can search for them. 
+ PathQueryEscaper pesc = new PathQueryEscaper(reader); + pesc.setOut(res); + pesc.consume(); + break; default: - return query.replace(":", "\\:"); + DefaultQueryEscaper desc = new DefaultQueryEscaper(reader); + desc.setOut(res); + desc.consume(); + break; } + return res.toString(); } /** diff --git a/src/org/opensolaris/opengrok/search/QueryEscaper.lexh b/src/org/opensolaris/opengrok/search/QueryEscaper.lexh new file mode 100644 index 00000000000..45ee1dd7387 --- /dev/null +++ b/src/org/opensolaris/opengrok/search/QueryEscaper.lexh @@ -0,0 +1,24 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018, Chris Fraire . 
+ */ + +LuceneSpecialEscape = \\[\+\-\&\|\!\(\)\{\}\[\]\^\"\~\*\?\:\\] diff --git a/src/org/opensolaris/opengrok/search/SearchEngine.java b/src/org/opensolaris/opengrok/search/SearchEngine.java index 4ca46e35bb7..e5b3a3038f5 100644 --- a/src/org/opensolaris/opengrok/search/SearchEngine.java +++ b/src/org/opensolaris/opengrok/search/SearchEngine.java @@ -541,11 +541,13 @@ public void results(int start, int end, List ret) { } } else { LOGGER.log(Level.WARNING, "Unknown genre: {0} for {1}", new Object[]{genre, filename}); - hasContext |= sourceContext.getContext(null, null, null, null, filename, tags, false, false, ret, scopes); + hasContext |= sourceContext.getContextHits(ret, + filename, tags, scopes); } } catch (FileNotFoundException exp) { LOGGER.log(Level.WARNING, "Couldn''t read summary from {0} ({1})", new Object[]{filename, exp.getMessage()}); - hasContext |= sourceContext.getContext(null, null, null, null, filename, tags, false, false, ret, scopes); + hasContext |= sourceContext.getContextHits(ret, + filename, tags, scopes); } } if (historyContext != null) { diff --git a/src/org/opensolaris/opengrok/search/TermEscaperBase.java b/src/org/opensolaris/opengrok/search/TermEscaperBase.java new file mode 100644 index 00000000000..61b13b214b9 --- /dev/null +++ b/src/org/opensolaris/opengrok/search/TermEscaperBase.java @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018, Chris Fraire . + */ + +package org.opensolaris.opengrok.search; + +import java.io.IOException; + +/** + * Represents an abstract base class for OpenGrok query building term + * transformers. + */ +public abstract class TermEscaperBase { + + protected StringBuilder out; + + /** + * @return the target + */ + public StringBuilder getOut() { + return out; + } + + /** + * @param out the target to set + */ + public void setOut(StringBuilder out) { + this.out = out; + } + + /** + * Call {@link #yylex()} until {@code false}, which consumes all input so + * that {@link #getOut()} contains the entire transformation. + */ + public void consume() { + try { + while (yylex()) { /* noop */ } + } catch (IOException ex) { + // cannot get here with StringBuilder operations + } + } + + /** + * "Runs the scanner [as documented by JFlex]. + *

[The method] can be used to get the next token from the input." + *

"Consume[s] input until one of the expressions in the specification + * is matched or an error occurs." + * @return a value returned by the lexer specification if defined or the + * {@code EOF} value upon reading end-of-file + * @throws IOException if an error occurs reading the input + */ + public abstract boolean yylex() throws IOException; +} diff --git a/src/org/opensolaris/opengrok/search/context/Context.java b/src/org/opensolaris/opengrok/search/context/Context.java index e080f975264..7e2bf775fef 100644 --- a/src/org/opensolaris/opengrok/search/context/Context.java +++ b/src/org/opensolaris/opengrok/search/context/Context.java @@ -48,6 +48,7 @@ import org.opensolaris.opengrok.analysis.FileAnalyzer; import org.opensolaris.opengrok.analysis.Scopes; import org.opensolaris.opengrok.analysis.Scopes.Scope; +import org.opensolaris.opengrok.analysis.TagDesc; import org.opensolaris.opengrok.analysis.plain.PlainAnalyzerFactory; import org.opensolaris.opengrok.configuration.RuntimeEnvironment; import org.opensolaris.opengrok.logger.LoggerFactory; @@ -67,6 +68,7 @@ public class Context { private char[] buffer; PlainLineTokenizer tokens; String queryAsURI; + private boolean alt = true; /** * Map whose keys tell which fields to look for in the source file, and @@ -83,8 +85,8 @@ public class Context { /** * Initializes a context generator for matchers derived from the specified - * {@code query} -- which might be {@code null} and result in - * {@link #isEmpty()} returning {@code true}. + * {@code query} -- which might be none and result in {@link #isEmpty()} + * equal to true. * @param query the query to generate the result for * @param qbuilder required builder used to create {@code query} */ @@ -112,6 +114,11 @@ public void toggleAlt() { alt = !alt; } + /** + * Gets a value indicating if no matchers were derived from the initialized + * {@link Query}. 
+ * @return {@code true} if no matchers were derived + */ public boolean isEmpty() { return m == null; } @@ -181,6 +188,7 @@ public boolean getContext2(RuntimeEnvironment env, IndexSearcher searcher, */ PlainAnalyzerFactory fac = PlainAnalyzerFactory.DEFAULT_INSTANCE; FileAnalyzer anz = fac.getAnalyzer(); + anz.setAllNonWhitespace(env.isAllNonWhitespace()); String path = doc.get(QueryBuilder.PATH); String pathE = Util.URIEncodePath(path); @@ -256,117 +264,68 @@ private void buildQueryAsURI(Map subqueries) { queryAsURI = sb.toString(); } - private boolean alt = true; - + /** + * Calls + * {@link #getContext(java.io.Reader, java.io.Writer, java.lang.String, java.lang.String, java.lang.String, org.opensolaris.opengrok.analysis.Definitions, boolean, boolean, java.util.List, org.opensolaris.opengrok.analysis.Scopes)} + * with {@code in}, {@code out}, {@code urlPrefix}, {@code morePrefix}, + * {@code path}, {@code tags}, {@code limit}, {@code isDefSearch}, + * {@code hits}, and {@code null}. + * @param in required input stream to be matched + * @param out optional output stream to write + * @param urlPrefix prefix for links + * @param morePrefix to link to more... page + * @param path path of the file + * @param tags code definitions. + * @param limit should the number of matching lines be limited? + * @param isDefSearch a value indicating whether to always print matched + * contexts or only when {@link Definitions} tags apply to a line + * @param hits optional instance + * @return Did it get any matching context? + */ public boolean getContext(Reader in, Writer out, String urlPrefix, String morePrefix, String path, Definitions tags, boolean limit, boolean isDefSearch, List hits) { return getContext(in, out, urlPrefix, morePrefix, path, tags, limit, isDefSearch, hits, null); } + /** - * ???. - * Closes the given in reader on return. 
- * - * @param in File to be matched - * @param out to write the context + * Look for context for this instance's initialized query in the specified + * input stream, and output according to the parameters. + * @param in required input stream to be matched (closed on return) + * @param out optional output stream to write + * @param urlPrefix prefix for links * @param morePrefix to link to more... page * @param path path of the file - * @param tags format to highlight defs. + * @param tags code definitions. * @param limit should the number of matching lines be limited? + * @param isDefSearch a value indicating whether to always print matched + * contexts or only when {@link Definitions} tags apply to a line + * @param hits optional instance + * @param scopes optional instance to read * @return Did it get any matching context? */ public boolean getContext(Reader in, Writer out, String urlPrefix, String morePrefix, String path, Definitions tags, boolean limit, boolean isDefSearch, List hits, Scopes scopes) { + + if (in == null) { + throw new IllegalArgumentException("`in' is null"); + } + if (m == null) { IOUtils.close(in); return false; } boolean anything = false; - TreeMap matchingTags = null; + TreeMap matchingTags = null; String urlPrefixE = (urlPrefix == null) ? 
"" : Util.URIEncodePath(urlPrefix); String pathE = Util.URIEncodePath(path); if (tags != null) { - matchingTags = new TreeMap(); - try { - for (Definitions.Tag tag : tags.getTags()) { - for (int i = 0; i < m.length; i++) { - if (m[i].match(tag.symbol) == LineMatcher.MATCHED) { - String scope = null; - String scopeUrl = null; - if (scopes != null) { - Scope scp = scopes.getScope(tag.line); - scope = scp.getName() + "()"; - scopeUrl = "" + scope + ""; - } - - /* desc[0] is matched symbol - * desc[1] is line number - * desc[2] is type - * desc[3] is matching line; - * desc[4] is scope - */ - String[] desc = { - tag.symbol, - Integer.toString(tag.line), - tag.type, - tag.text, - scope, - }; - if (in == null) { - if (out == null) { - Hit hit = new Hit(path, - Util.htmlize(desc[3]).replace( - desc[0], "" + desc[0] + ""), - desc[1], false, alt); - hits.add(hit); - anything = true; - } else { - out.write(""); - out.write(desc[1]); - out.write(" "); - out.write(Util.htmlize(desc[3]).replace( - desc[0], "" + desc[0] + "")); - out.write(" "); - - if (desc[4] != null) { - out.write("in "); - out.write(desc[4]); - out.write(" "); - } - out.write(""); - out.write(desc[2]); - out.write("
"); - anything = true; - } - } else { - matchingTags.put(tag.line, desc); - } - break; - } - } - } - } catch (Exception e) { - if (hits != null) { - // @todo verify why we ignore all exceptions? - LOGGER.log(Level.WARNING, "Could not get context for " + path, e); - } - } - } - /** - * Just to get the matching tag send a null in - */ - if (in == null) { - return anything; + matchingTags = new TreeMap<>(); + getContextTags(matchingTags, tags, scopes); } + int charsRead = 0; boolean truncated = false; @@ -458,4 +417,74 @@ public boolean getContext(Reader in, Writer out, String urlPrefix, } return anything; } + + /** + * Gets matching, reportable hits from the specified {@code tags} instance. + * @param hits a defined instance to write + * @param path a defined instance to use for hit filenames + * @param tags a defined instance to read + * @param scopes optional scopes instance + * @return {@code true} if any tags were put to {@code hits} + */ + public boolean getContextHits(List hits, String path, + Definitions tags, Scopes scopes) { + + Map matchingTags = new TreeMap<>(); + boolean ret = getContextTags(matchingTags, tags, scopes); + + for (Map.Entry entry : matchingTags.entrySet()) { + TagDesc desc = entry.getValue(); + Hit hit = makeHit(path, desc); + hits.add(hit); + } + return ret; + } + + /** + * Gets matching, reportable tags from the specified {@code tags} instance. 
+ * @param matchingTags a defined instance to write + * @param tags a defined instance to read + * @param scopes optional scopes instance + * @return {@code true} if any tags were put to {@code matchingTags} + */ + public boolean getContextTags(Map matchingTags, + Definitions tags, Scopes scopes) { + + if (m == null) { + return false; + } + + boolean anything = false; + + for (Definitions.Tag tag : tags.getTags()) { + for (LineMatcher m1 : m) { + if (m1.match(tag.symbol) == LineMatcher.MATCHED) { + String scope = null; + if (scopes != null) { + Scope scp = scopes.getScope(tag.line); + scope = scp.getName() + "()"; + } + + TagDesc desc = new TagDesc(tag.symbol, + Integer.toString(tag.line), tag.type, tag.text, scope); + matchingTags.put(tag.line, desc); + anything = true; + break; + } + } + } + + return anything; + } + + /** + * Converts the specified {@code desc} into a {@link Hit}. + * @param path defined instance + * @param desc defined instance + * @return defined instance + */ + private Hit makeHit(String path, TagDesc desc) { + return new Hit(path, Util.htmlize(desc.text).replace(desc.symbol, + "" + desc.symbol + ""), desc.lineno, false, alt); + } } diff --git a/src/org/opensolaris/opengrok/search/context/PlainLineTokenizer.lex b/src/org/opensolaris/opengrok/search/context/PlainLineTokenizer.lex index 6bd11c7c533..a6217cf03db 100644 --- a/src/org/opensolaris/opengrok/search/context/PlainLineTokenizer.lex +++ b/src/org/opensolaris/opengrok/search/context/PlainLineTokenizer.lex @@ -19,6 +19,7 @@ /* * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright (c) 2018, Chris Fraire . 
*/ /** @@ -36,7 +37,7 @@ import org.opensolaris.opengrok.search.Hit; import org.opensolaris.opengrok.web.Util; import org.opensolaris.opengrok.analysis.Scopes; import org.opensolaris.opengrok.analysis.Scopes.Scope; - +import org.opensolaris.opengrok.analysis.TagDesc; %% %public @@ -63,7 +64,7 @@ import org.opensolaris.opengrok.analysis.Scopes.Scope; boolean dumpRest = false; Writer out; String url; - TreeMap tags; + TreeMap tags; boolean prevHi = false; Integer prevLn = null; List hits; @@ -104,11 +105,13 @@ import org.opensolaris.opengrok.analysis.Scopes.Scope; } - public void reInit(char[] buf, int len, Writer out, String url, TreeMap tags, Scopes scopes) { + public void reInit(char[] buf, int len, Writer out, String url, + TreeMap tags, Scopes scopes) { reInit(new CharArrayReader(buf, 0, len), out, url, tags, scopes); } - public void reInit(Reader in, Writer out, String url, TreeMap tags, Scopes scopes) { + public void reInit(Reader in, Writer out, String url, + TreeMap tags, Scopes scopes) { yyreset(in); markedContents.setLength(0); @@ -124,7 +127,7 @@ import org.opensolaris.opengrok.analysis.Scopes.Scope; this.url = url; this.tags = tags; if(this.tags == null) { - this.tags = new TreeMap(); + this.tags = new TreeMap(); } this.scopes = scopes; prevHi = false; @@ -170,8 +173,8 @@ import org.opensolaris.opengrok.analysis.Scopes.Scope; out.write(""); if (prevHi) { out.write(" "); - String[] desc = tags.remove(prevLn); - out.write(desc[2]); + TagDesc desc = tags.remove(prevLn); + out.write(desc.type); out.write(" "); } out.write("
"); @@ -220,8 +223,8 @@ import org.opensolaris.opengrok.analysis.Scopes.Scope; Integer ln = Integer.valueOf(lineNo); boolean hi = tags.containsKey(ln); if (prevHi) { - String[] desc = tags.remove(prevLn); - hit.setTag(desc[2]); + TagDesc desc = tags.remove(prevLn); + hit.setTag(desc.type); } prevHi = hi; prevLn = ln; @@ -341,8 +344,8 @@ import org.opensolaris.opengrok.analysis.Scopes.Scope; if (prevHi) { out.write(" "); - String[] desc = tags.remove(prevLn); - out.write(desc[2]); + TagDesc desc = tags.remove(prevLn); + out.write(desc.type); out.write(" "); } out.write("
"); @@ -350,8 +353,8 @@ import org.opensolaris.opengrok.analysis.Scopes.Scope; formatWithNum(rest, rest+i-1, markedLine); hit.setLine(sb.toString()); if (prevHi) { - String[] desc = tags.remove(prevLn); - hit.setTag(desc[2]); + TagDesc desc = tags.remove(prevLn); + hit.setTag(desc.type); } hits.add(hit); } @@ -362,30 +365,32 @@ import org.opensolaris.opengrok.analysis.Scopes.Scope; if (tags.size() > 0) { if (out != null) { for(Integer rem : tags.keySet()) { - String[] desc = tags.get(rem); + TagDesc desc = tags.get(rem); out.write(""); - out.write(desc[1]); + out.write(desc.lineno); out.write(" "); - out.write(Util.htmlize(desc[3]).replace(desc[0], "" + desc[0] + "")); + out.write(Util.htmlize(desc.text).replace(desc.symbol, "" + + desc.symbol + "")); out.write(" "); - if (desc[4] != null) { + if (desc.scope != null) { out.write(""); - out.write(desc[4]); + out.write(desc.scope); out.write(" "); } out.write(" "); - out.write(desc[2]); + out.write(desc.type); out.write("
"); } } else { for(Integer rem : tags.keySet()) { - String[] desc = tags.get(rem); - hit = new Hit(url, "" + Util.htmlize(desc[3]).replace(desc[0], "" + desc[0] + ""), - desc[1], false, alt); - hit.setTag(desc[2]); + TagDesc desc = tags.get(rem); + hit = new Hit(url, "" + Util.htmlize(desc.text).replace( + desc.symbol, "" + desc.symbol + ""), desc.lineno, + false, alt); + hit.setTag(desc.type); hits.add(hit); } } @@ -436,8 +441,8 @@ Printable = [\@\$\%\^\&\-+=\?\.\:] if(prevHi){ out.write(" "); - String[] desc = tags.remove(prevLn); - out.write(desc[2]); + TagDesc desc = tags.remove(prevLn); + out.write(desc.type); out.write(" "); } out.write("
"); @@ -445,8 +450,8 @@ Printable = [\@\$\%\^\&\-+=\?\.\:] formatWithNum(rest, endPos, markedLine); hit.setLine(sb.toString()); if(prevHi){ - String[] desc = tags.remove(prevLn); - hit.setTag(desc[2]); + TagDesc desc = tags.remove(prevLn); + hit.setTag(desc.type); } hits.add(hit); sb.setLength(0); diff --git a/src/org/opensolaris/opengrok/util/TextTrieMap.java b/src/org/opensolaris/opengrok/util/TextTrieMap.java new file mode 100644 index 00000000000..3fc7443a683 --- /dev/null +++ b/src/org/opensolaris/opengrok/util/TextTrieMap.java @@ -0,0 +1,407 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License +// Copyright © 1991-2018 Unicode, Inc. All rights reserved. +// Distributed under the Terms of Use in http://www.unicode.org/copyright.html. +// Portions Copyright (c) 2018, Chris Fraire . +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of the Unicode data files and any associated documentation +// (the "Data Files") or Unicode software and any associated documentation +// (the "Software") to deal in the Data Files or Software +// without restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, and/or sell copies of +// the Data Files or Software, and to permit persons to whom the Data Files +// or Software are furnished to do so, provided that either +// (a) this copyright and permission notice appear with all copies +// of the Data Files or Software, or +// (b) this copyright and permission notice appear in associated +// Documentation. +// +// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF +// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
+// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS +// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL +// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, +// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +// PERFORMANCE OF THE DATA FILES OR SOFTWARE. +// +// Except as contained in this notice, the name of a copyright holder +// shall not be used in advertising or otherwise to promote the sale, +// use or other dealings in these Data Files or Software without prior +// written authorization of the copyright holder. + +/* + * ******************************************************************************** + * Copyright (C) 2007-2011, International Business Machines Corporation and others. + * All Rights Reserved. + * ******************************************************************************** + */ +package org.opensolaris.opengrok.util; + +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.ListIterator; + +/** + * TextTrieMap is a trie implementation for supporting + * fast prefix match for the key. + *

+ * OpenGrok's import of this + * + * ICU class strips out its {@code ignoreCase} handling which depends on + * that project's per-character case-folding algorithms that are external to + * this class. OpenGrok users requiring case-insensitive tries should build + * with {@link String} case-folded entries and search for likewise + * {@link String} case-folded entries. + */ +public class TextTrieMap { + + private Node _root = new Node(); + + /** + * Adds the text key and its associated object in this object. + * + * @param text The text. + * @param val The value object associated with the text. + */ + public TextTrieMap put(CharSequence text, V val) { + CharIterator chitr = new CharIterator(text, 0); + _root.add(chitr, val); + return this; + } + + /** + * Gets an iterator of the objects associated with the + * longest prefix matching string key. + * + * @param text The text to be matched with prefixes. + * @return An iterator of the objects associated with + * the longest prefix matching matching key, or null + * if no matching entry is found. + */ + public Iterator get(String text) { + return get(text, 0); + } + + /** + * Gets an iterator of the objects associated with the + * longest prefix matching string key starting at the + * specified position. + * + * @param text The text to be matched with prefixes. + * @param start The start index of of the text + * @return An iterator of the objects associated with the + * longest prefix matching matching key, or null if no + * matching entry is found. 
+ */ + public Iterator get(CharSequence text, int start) { + return get(text, start, null); + } + + public Iterator get(CharSequence text, int start, int[] matchLen) { + LongestMatchHandler handler = new LongestMatchHandler(); + find(text, start, handler); + if (matchLen != null && matchLen.length > 0) { + matchLen[0] = handler.getMatchLength(); + } + return handler.getMatches(); + } + + public void find(CharSequence text, ResultHandler handler) { + find(text, 0, handler); + } + + public void find(CharSequence text, int offset, ResultHandler handler) { + CharIterator chitr = new CharIterator(text, offset); + find(_root, chitr, handler); + } + + private void find(Node node, CharIterator chitr, ResultHandler handler) { + Iterator values = node.values(); + if (values != null) { + if (!handler.handlePrefixMatch(chitr.processedLength(), values)) { + return; + } + } + + Node nextMatch = node.findMatch(chitr); + if (nextMatch != null) { + find(nextMatch, chitr, handler); + } + } + + public static class CharIterator implements Iterator { + private CharSequence _text; + private int _nextIdx; + private int _startIdx; + + private Character _remainingChar; + + CharIterator(CharSequence text, int offset) { + _text = text; + _nextIdx = _startIdx = offset; + } + + /* (non-Javadoc) + * @see java.util.Iterator#hasNext() + */ + @Override + public boolean hasNext() { + if (_nextIdx == _text.length() && _remainingChar == null) { + return false; + } + return true; + } + + /* (non-Javadoc) + * @see java.util.Iterator#next() + */ + @Override + public Character next() { + if (_nextIdx == _text.length() && _remainingChar == null) { + return null; + } + Character next; + if (_remainingChar != null) { + next = _remainingChar; + _remainingChar = null; + } else { + next = _text.charAt(_nextIdx); + _nextIdx++; + } + return next; + } + + /* (non-Javadoc) + * @see java.util.Iterator#remove() + */ + @Override + public void remove() { + throw new UnsupportedOperationException("remove() not 
supproted"); + } + + public int nextIndex() { + return _nextIdx; + } + + public int processedLength() { + if (_remainingChar != null) { + throw new IllegalStateException("In the middle of surrogate pair"); + } + return _nextIdx - _startIdx; + } + } + + /** + * Callback handler for processing prefix matches used by + * find method. + */ + public interface ResultHandler { + /** + * Handles a prefix key match + * + * @param matchLength Matched key's length + * @param values An iterator of the objects associated with the matched key + * @return Return true to continue the search in the trie, false to quit. + */ + public boolean handlePrefixMatch(int matchLength, Iterator values); + } + + private static class LongestMatchHandler implements ResultHandler { + private Iterator matches = null; + private int length = 0; + + @Override + public boolean handlePrefixMatch(int matchLength, Iterator values) { + if (matchLength > length) { + length = matchLength; + matches = values; + } + return true; + } + + public Iterator getMatches() { + return matches; + } + + public int getMatchLength() { + return length; + } + } + + /** + * Inner class representing a text node in the trie. 
+ */ + private class Node { + private char[] _text; + private List _values; + private List _children; + + private Node() { + } + + private Node(char[] text, List values, List children) { + _text = text; + _values = values; + _children = children; + } + + public Iterator values() { + if (_values == null) { + return null; + } + return _values.iterator(); + } + + public void add(CharIterator chitr, V value) { + StringBuilder buf = new StringBuilder(); + while (chitr.hasNext()) { + buf.append(chitr.next()); + } + add(toCharArray(buf), 0, value); + } + + public Node findMatch(CharIterator chitr) { + if (_children == null) { + return null; + } + if (!chitr.hasNext()) { + return null; + } + Node match = null; + Character ch = chitr.next(); + for (Node child : _children) { + if (ch < child._text[0]) { + break; + } + if (ch == child._text[0]) { + if (child.matchFollowing(chitr)) { + match = child; + } + break; + } + } + return match; + } + + private void add(char[] text, int offset, V value) { + if (text.length == offset) { + _values = addValue(_values, value); + return; + } + + if (_children == null) { + _children = new LinkedList(); + Node child = new Node(subArray(text, offset), addValue(null, value), null); + _children.add(child); + return; + } + + // walk through children + ListIterator litr = _children.listIterator(); + while (litr.hasNext()) { + Node next = litr.next(); + if (text[offset] < next._text[0]) { + litr.previous(); + break; + } + if (text[offset] == next._text[0]) { + int matchLen = next.lenMatches(text, offset); + if (matchLen == next._text.length) { + // full match + next.add(text, offset + matchLen, value); + } else { + // partial match, create a branch + next.split(matchLen); + next.add(text, offset + matchLen, value); + } + return; + } + } + // add a new child to this node + litr.add(new Node(subArray(text, offset), addValue(null, value), null)); + } + + private boolean matchFollowing(CharIterator chitr) { + boolean matched = true; + int idx = 1; + 
while (idx < _text.length) { + if(!chitr.hasNext()) { + matched = false; + break; + } + Character ch = chitr.next(); + if (ch != _text[idx]) { + matched = false; + break; + } + idx++; + } + return matched; + } + + private int lenMatches(char[] text, int offset) { + int textLen = text.length - offset; + int limit = _text.length < textLen ? _text.length : textLen; + int len = 0; + while (len < limit) { + if (_text[len] != text[offset + len]) { + break; + } + len++; + } + return len; + } + + private void split(int offset) { + // split the current node at the offset + char[] childText = subArray(_text, offset); + _text = subArray(_text, 0, offset); + + // add the Node representing after the offset as a child + Node child = new Node(childText, _values, _children); + _values = null; + + _children = new LinkedList(); + _children.add(child); + } + + private List addValue(List list, V value) { + if (list == null) { + list = new LinkedList(); + } + list.add(value); + return list; + } + } + + private static char[] toCharArray(CharSequence text) { + char[] array = new char[text.length()]; + for (int i = 0; i < array.length; i++) { + array[i] = text.charAt(i); + } + return array; + } + + private static char[] subArray(char[] array, int start) { + if (start == 0) { + return array; + } + char[] sub = new char[array.length - start]; + System.arraycopy(array, start, sub, 0, sub.length); + return sub; + } + + private static char[] subArray(char[] array, int start, int limit) { + if (start == 0 && limit == array.length) { + return array; + } + char[] sub = new char[limit - start]; + System.arraycopy(array, start, sub, 0, limit - start); + return sub; + } +} diff --git a/src/org/opensolaris/opengrok/web/Util.java b/src/org/opensolaris/opengrok/web/Util.java index 565bb347eb9..ca63e65fda3 100644 --- a/src/org/opensolaris/opengrok/web/Util.java +++ b/src/org/opensolaris/opengrok/web/Util.java @@ -20,7 +20,7 @@ /* * Copyright (c) 2005, 2018, Oracle and/or its affiliates. 
All rights reserved. * Portions Copyright 2011 Jens Elkner. - * Portions Copyright (c) 2017, Chris Fraire . + * Portions Copyright (c) 2017-2018, Chris Fraire . */ package org.opensolaris.opengrok.web; @@ -61,6 +61,7 @@ import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; import javax.servlet.http.HttpServletRequest; +import org.apache.lucene.queryparser.classic.QueryParser; import org.json.simple.JSONArray; import org.json.simple.JSONObject; import org.opensolaris.opengrok.Info; @@ -82,20 +83,8 @@ public final class Util { private static final int BOLD_COUNT_THRESHOLD = 1000; - /** - * Matches a character that is not ASCII alpha-numeric or underscore: - *

-     * {@code
-     * [^A-Za-z0-9_]
-     * }
-     * 
- * (Edit above and paste below [in NetBeans] for easy String escaping.) - */ - private final static Pattern NON_ASCII_ALPHA_NUM = Pattern.compile( - "[^A-Za-z0-9_]"); - + /** Private to enforce static */ private Util() { - // singleton } /** @@ -147,24 +136,15 @@ public static String prehtmlize(CharSequence q) { } /** - * Append to {@code dest} the UTF-8 URL-encoded representation of - * {@code str}, within explicit quotes (%22) to accommodate Lucene querying - * if {@code str} contains any character that is not ASCII-alphanumeric or - * underscore. + * Append to {@code dest} the UTF-8 URL-encoded representation of the + * Lucene-escaped version of {@code str}. * @param str a defined instance * @param dest a defined target * @throws IOException */ public static void qurlencode(String str, Appendable dest) throws IOException { - if (NON_ASCII_ALPHA_NUM.matcher(str).find()) { - final String UQUOTE = "%22"; - dest.append(UQUOTE); - URIEncode(str, dest); - dest.append(UQUOTE); - } else { - URIEncode(str, dest); - } + URIEncode(QueryParser.escape(str), dest); } /** diff --git a/test/org/opensolaris/opengrok/analysis/JFlexXrefTest.java b/test/org/opensolaris/opengrok/analysis/JFlexXrefTest.java index 29e993e2893..3f9eb4cd6d0 100644 --- a/test/org/opensolaris/opengrok/analysis/JFlexXrefTest.java +++ b/test/org/opensolaris/opengrok/analysis/JFlexXrefTest.java @@ -434,7 +434,7 @@ public void truncatedUuencodedFile() throws IOException { assertLinesEqual("UuencodeXref truncated", "1" + "begin 644 " - + "test.txt" + + "test.txt" + "\n" + "2", out.toString()); diff --git a/test/org/opensolaris/opengrok/analysis/c/samplesymbols_c2.txt b/test/org/opensolaris/opengrok/analysis/c/samplesymbols_c2.txt new file mode 100644 index 00000000000..05d617b8e79 --- /dev/null +++ b/test/org/opensolaris/opengrok/analysis/c/samplesymbols_c2.txt @@ -0,0 +1,1091 @@ +/* |1 +* |1 +cddl |1 +header |1 +start |1 +* |1 +* |1 +the |1 +contents |1 +of |1 +this |1 +file |1 +are |1 +subject |1 +to |1 
+the |1 +terms |1 +of |1 +the |1 +* |1 +common |1 +development |1 +and |1 +distribution |1 +license |1 +license, |0 +version |1 +1 |1 +1.0 |0 +0 |1 +only |1 +* |1 +(the |1 +the |0 +"license |1 +"license" |0 +"license") |0 +"license"). |0 +license |0 +license" |0 +license") |0 +license"). |0 +you |1 +may |1 +not |1 +use |1 +this |1 +file |1 +except |1 +in |1 +compliance |1 +* |1 +with |1 +the |1 +license |1 +license. |0 +* |1 +* |1 +you |1 +can |1 +obtain |1 +a |1 +copy |1 +of |1 +the |1 +license |1 +at |1 +usr |1 +usr/ |0 +usr/src |0 +usr/src/ |0 +usr/src/opensolaris |0 +usr/src/opensolaris.license |0 +src |1 +src/ |0 +src/opensolaris |0 +src/opensolaris.license |0 +opensolaris |1 +opensolaris.license |0 +license |1 +* |1 +or |1 +http |1 +http:// |0 +http://www |0 +http://www.opensolaris |0 +http://www.opensolaris.org |0 +http://www.opensolaris.org/ |0 +http://www.opensolaris.org/os |0 +http://www.opensolaris.org/os/ |0 +http://www.opensolaris.org/os/licensing |0 +http://www.opensolaris.org/os/licensing. |0 +www |1 +www.opensolaris |0 +www.opensolaris.org |0 +www.opensolaris.org/ |0 +www.opensolaris.org/os |0 +www.opensolaris.org/os/ |0 +www.opensolaris.org/os/licensing |0 +www.opensolaris.org/os/licensing. |0 +opensolaris |1 +opensolaris.org |0 +opensolaris.org/ |0 +opensolaris.org/os |0 +opensolaris.org/os/ |0 +opensolaris.org/os/licensing |0 +opensolaris.org/os/licensing. |0 +org |1 +org/ |0 +org/os |0 +org/os/ |0 +org/os/licensing |0 +org/os/licensing. |0 +os |1 +os/ |0 +os/licensing |0 +os/licensing. |0 +licensing |1 +licensing. |0 +* |1 +see |1 +the |1 +license |1 +for |1 +the |1 +specific |1 +language |1 +governing |1 +permissions |1 +* |1 +and |1 +limitations |1 +under |1 +the |1 +license |1 +license. 
|0 +* |1 +* |1 +when |1 +distributing |1 +covered |1 +code |1 +code, |0 +include |1 +this |1 +cddl |1 +header |1 +in |1 +each |1 +* |1 +file |1 +and |1 +include |1 +the |1 +license |1 +file |1 +at |1 +usr |1 +usr/ |0 +usr/src |0 +usr/src/ |0 +usr/src/opensolaris |0 +usr/src/opensolaris.license |0 +usr/src/opensolaris.license. |0 +src |1 +src/ |0 +src/opensolaris |0 +src/opensolaris.license |0 +src/opensolaris.license. |0 +opensolaris |1 +opensolaris.license |0 +opensolaris.license. |0 +license |1 +license. |0 +* |1 +if |1 +applicable |1 +applicable, |0 +add |1 +the |1 +following |1 +below |1 +this |1 +cddl |1 +header |1 +header, |0 +with |1 +the |1 +* |1 +fields |1 +enclosed |1 +by |1 +brackets |1 +"[ |1 +"[] |0 +"[]" |0 +[] |0 +[]" |0 +replaced |1 +with |1 +your |1 +own |1 +identifying |1 +* |1 +information |1 +information: |0 +portions |1 +copyright |1 +[yyyy |1 +[yyyy] |0 +yyyy |0 +yyyy] |0 +[name |1 +name |0 +of |1 +copyright |1 +owner |1 +owner] |0 +* |1 +* |1 +cddl |1 +header |1 +end |1 +*/ |1 +/* |1 +* |1 +copyright |1 +2004 |1 +sun |1 +microsystems |1 +microsystems, |0 +inc |1 +inc. |0 +all |1 +rights |1 +reserved |1 +reserved. |0 +* |1 +use |1 +is |1 +subject |1 +to |1 +license |1 +terms |1 +terms. |0 +*/ |1 +#pragma |1 +pragma |0 +ident |1 +"% |1 +"%z |0 +"%z%% |0 +"%z%%m |0 +"%z%%m% |0 +%z |0 +%z%% |0 +%z%%m |0 +%z%%m% |0 +z |0 +z%% |0 +z%%m |0 +z%%m% |0 +m |1 +m% |0 +%i |1 +%i% |0 +i |0 +i% |0 +%e |1 +%e% |0 +e |0 +e% |0 +smi |1 +smi" |0 +#include |1 +include |0 + |0 +sys |0 +sys/ |0 +sys/types |0 +sys/types.h |0 +sys/types.h> |0 +types |1 +types.h |0 +types.h> |0 +h |1 +h> |0 +#include |1 +include |0 +