Skip to content

Feature/tokenizer mode #2104

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions OpenGrok
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ Supported Environment Variables for configuring the default setup:
- OPENGROK_PROGRESS Shows progress in %(percentage) of working
through project. It's good to have Verbose
Mode enabled too. (*)
- OPENGROK_ALL_NONWHITESPACE Index all non-whitespace for FULL queries.
on|off (default off) (^)
- OPENGROK_RENAMED_FILES_HISTORY Get full history of renamed files for SCMs
that support it (Git, Mercurial). When set to
on, the indexing is slower, especially in the
Expand Down Expand Up @@ -450,6 +452,12 @@ ${BZR:+-Dorg.opensolaris.opengrok.history.Bazaar=$BZR} \
ASSIGNMENTS="`echo $OPENGROK_ASSIGNMENTS | sed 's/[:space:]+/_/g'`"
ASSIGNMENTS="-A `echo $ASSIGNMENTS | sed 's/,/ -A /g'`"
fi

OPENGROK_ALL_NONWHITESPACE="${OPENGROK_ALL_NONWHITESPACE:-off}"
case "$OPENGROK_ALL_NONWHITESPACE" in
on|true|1) ALL_NONWHITESPACE="--allNonWhitespace on" ;;
*) ALL_NONWHITESPACE="" ;;
esac
}

#
Expand Down Expand Up @@ -898,6 +906,7 @@ CommonInvocation()
${OPENGROK_FLUSH_RAM_BUFFER_SIZE} ${SKIN} ${LEADING_WILDCARD} \
${OPENGROK_PARALLELISM:+--threads} ${OPENGROK_PARALLELISM} \
${ASSIGNMENTS} \
${ALL_NONWHITESPACE} \
${READ_XML_CONF} \
${WEBAPP_CONFIG} \
${OPENGROK_PROFILER:+--profiler} \
Expand Down
7 changes: 6 additions & 1 deletion build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,12 @@ Portions Copyright (c) 2017-2018, Chris Fraire <[email protected]>.
<run-jflex dir="${gen.analysis.dir}/uue" name="UuencodeFullTokenizer"/>
<run-jflex dir="${gen.analysis.dir}/pascal" name="PascalSymbolTokenizer"/>
<run-jflex dir="${gen.analysis.dir}/pascal" name="PascalXref"/>


<property name="gen.search.dir" value="/org/opensolaris/opengrok/search"/>
<run-jflex dir="${gen.search.dir}" name="DefaultQueryEscaper"/>
<run-jflex dir="${gen.search.dir}" name="FullQueryEscaper"/>
<run-jflex dir="${gen.search.dir}" name="PathQueryEscaper"/>

<property name="gen.context.dir" value="/org/opensolaris/opengrok/search/context"/>
<run-jflex dir="${gen.context.dir}" name="HistoryLineTokenizer"/>
<run-jflex dir="${gen.context.dir}" name="PlainLineTokenizer"/>
Expand Down
7 changes: 7 additions & 0 deletions opengrok-indexer/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,13 @@ Portions Copyright (c) 2017-2018, Chris Fraire <[email protected]>.
<exclude>*.java</exclude>
</excludes>
</testResource>
<testResource>
<targetPath>org/opensolaris/opengrok/analysis/plain/</targetPath>
<directory>../test/org/opensolaris/opengrok/analysis/plain/</directory>
<excludes>
<exclude>*.java</exclude>
</excludes>
</testResource>
<testResource>
<targetPath>org/opensolaris/opengrok/analysis/powershell/</targetPath>
<directory>../test/org/opensolaris/opengrok/analysis/powershell/</directory>
Expand Down
38 changes: 25 additions & 13 deletions src/org/opensolaris/opengrok/analysis/CompatibleAnalyser.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
* CDDL HEADER END
*/

/*
/*
* Copyright (c) 2005, 2017, Oracle and/or its affiliates. All rights reserved.
* Portions Copyright (c) 2017, Chris Fraire <[email protected]>.
* Portions Copyright (c) 2017-2018, Chris Fraire <[email protected]>.
*/
package org.opensolaris.opengrok.analysis;

Expand All @@ -40,29 +40,30 @@ public CompatibleAnalyser() {
protected TokenStreamComponents createComponents(String fieldName) {
switch (fieldName) {
case QueryBuilder.FULL:
return new TokenStreamComponents(createPlainFullTokenizer());
return new TokenStreamComponents(
createNonWhitespaceFullTokenizer());
case QueryBuilder.REFS:
return new TokenStreamComponents(createPlainSymbolTokenizer());
return new TokenStreamComponents(
createNonWhitespaceSymbolTokenizer());
case QueryBuilder.DEFS:
return new TokenStreamComponents(createPlainSymbolTokenizer());
return new TokenStreamComponents(
createNonWhitespaceSymbolTokenizer());
case QueryBuilder.PATH:
case QueryBuilder.PROJECT:
return new TokenStreamComponents(new PathTokenizer());
case QueryBuilder.HIST:
return new HistoryAnalyzer().createComponents(fieldName);
default:
return new TokenStreamComponents(createPlainFullTokenizer());
return new TokenStreamComponents(
createPlainFullTokenizer(TokenizerMode.SYMBOLS_ONLY));
}
}

private JFlexTokenizer createPlainSymbolTokenizer() {
return new JFlexTokenizer(new PlainSymbolTokenizer(
FileAnalyzer.dummyReader));
}

private JFlexTokenizer createPlainFullTokenizer() {
return new JFlexTokenizer(new PlainFullTokenizer(
private JFlexTokenizer createPlainFullTokenizer(TokenizerMode mode) {
JFlexTokenizer tokenizer = new JFlexTokenizer(new PlainFullTokenizer(
FileAnalyzer.dummyReader));
tokenizer.setTokenizerMode(mode);
return tokenizer;
}

@Override
Expand All @@ -75,4 +76,15 @@ protected TokenStream normalize(String fieldName, TokenStream in) {
return new LowerCaseFilter(in);
}
}

/**
 * Creates a plain full tokenizer configured for
 * {@link TokenizerMode#NON_WHITESPACE_ONLY}.
 * @return a defined instance
 */
private JFlexTokenizer createNonWhitespaceFullTokenizer() {
    final TokenizerMode mode = TokenizerMode.NON_WHITESPACE_ONLY;
    return createPlainFullTokenizer(mode);
}

/**
 * Creates a tokenizer wrapping a {@link PlainSymbolTokenizer} over
 * {@link FileAnalyzer#dummyReader}, configured for
 * {@link TokenizerMode#NON_WHITESPACE_ONLY}.
 * @return a defined instance
 */
private JFlexTokenizer createNonWhitespaceSymbolTokenizer() {
    PlainSymbolTokenizer matcher =
            new PlainSymbolTokenizer(FileAnalyzer.dummyReader);
    JFlexTokenizer result = new JFlexTokenizer(matcher);
    result.setTokenizerMode(TokenizerMode.NON_WHITESPACE_ONLY);
    return result;
}
}
16 changes: 15 additions & 1 deletion src/org/opensolaris/opengrok/analysis/FileAnalyzer.java
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ public class FileAnalyzer extends Analyzer {
protected Ctags ctags;
protected boolean scopesEnabled;
protected boolean foldingEnabled;
protected boolean allNonWhitespace;
private final FileAnalyzerFactory factory;

/**
Expand Down Expand Up @@ -169,6 +170,14 @@ public void setFoldingEnabled(boolean foldingEnabled) {
this.foldingEnabled = supportsScopes() && foldingEnabled;
}

/**
 * Sets whether all non-whitespace should be indexed for FULL search.
 * The default is {@code false}.
 * @param value {@code true} to index all non-whitespace for FULL search;
 * {@code false} otherwise
 */
public void setAllNonWhitespace(boolean value) {
    allNonWhitespace = value;
}

protected boolean supportsScopes() {
return false;
}
Expand Down Expand Up @@ -319,8 +328,13 @@ private JFlexTokenizer createPlainSymbolTokenizer() {
}

private JFlexTokenizer createPlainFullTokenizer() {
return new JFlexTokenizer(new PlainFullTokenizer(
JFlexTokenizer tokenizer = new JFlexTokenizer(new PlainFullTokenizer(
FileAnalyzer.dummyReader));
tokenizer.setTokenizerModeSupplier(() -> {
return allNonWhitespace ? TokenizerMode.SYMBOLS_AND_NON_WHITESPACE :
TokenizerMode.SYMBOLS_ONLY;
});
return tokenizer;
}

@Override
Expand Down
69 changes: 64 additions & 5 deletions src/org/opensolaris/opengrok/analysis/JFlexSymbolMatcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,16 @@ public abstract class JFlexSymbolMatcher extends JFlexStateStacker
private NonSymbolMatchedListener nonSymbolListener;
private String disjointSpanClassName;

/**
 * Indicates whether the matcher is case-insensitive by default -- i.e.
 * whether tokens should be lower-cased when published in a stream.
 * @return {@code false} in this base implementation; subclasses should
 * override where necessary
 */
@Override
public boolean isDefaultCaseInsensitive() {
    return false;
}

/**
* Associates the specified listener, replacing the former one.
* @param l defined instance
Expand Down Expand Up @@ -78,6 +88,27 @@ public void clearNonSymbolMatchedListener() {
nonSymbolListener = null;
}

/**
 * Ignores the specified mode. Subclasses can override if they need to
 * alter their behavior for different modes.
 * @param value the requested mode (unused by this base implementation)
 */
@Override
public void setTokenizerMode(TokenizerMode value) {
    // Intentionally empty: the base matcher has no mode-dependent behavior.
}

/**
 * Reports that {@code str} does not start with a contraction. Subclasses
 * can override to determine whether {@code str} starts with a contraction
 * (i.e., a word containing letters and non-word characters such as
 * "ain't") according to the specific language.
 * @param str a defined instance
 * @return 0 if {@code str} does not start with a contraction; or else the
 * length of the longest initial contraction. This base implementation
 * always returns 0.
 */
@Override
public int getLongestContractionPrefix(String str) {
    return 0;
}

/**
* Gets the class name value from the last call to
* {@link #onDisjointSpanChanged(java.lang.String, int)}.
Expand All @@ -103,6 +134,24 @@ protected void onSymbolMatched(String str, int start) {
}
}

/**
 * Raises
 * {@link SymbolMatchedListener#symbolMatched(org.opensolaris.opengrok.analysis.SymbolMatchedEvent)}
 * for a subscribed listener, if any. The event spans from {@code start}
 * to {@code start} plus the length of {@code literal}.
 * @param literal the literal representation of the symbol; only its length
 * is used here
 * @param str the symbol string
 * @param start the symbol literal start position
 */
protected void onSymbolMatched(String literal, String str, int start) {
    SymbolMatchedListener listener = symbolListener;
    if (listener == null) {
        return;
    }
    // TODO: publish literal through SymbolMatchedEvent.
    int end = start + literal.length();
    listener.symbolMatched(new SymbolMatchedEvent(this, str, start, end));
}

/**
* Raises
* {@link SymbolMatchedListener#sourceCodeSeen(org.opensolaris.opengrok.analysis.SourceCodeSeenEvent)}
Expand All @@ -122,9 +171,11 @@ protected void onSourceCodeSeen(int start) {
* {@link String#valueOf(char)} {@code c} and {@code start}.
* @param c the text character
* @param start the text start position
* @return {@code true} if one or more complete tokens were published from
* the text
*/
protected void onNonSymbolMatched(char c, int start) {
onNonSymbolMatched(String.valueOf(c), start);
protected boolean onNonSymbolMatched(char c, int start) {
return onNonSymbolMatched(String.valueOf(c), start);
}

/**
Expand All @@ -133,14 +184,18 @@ protected void onNonSymbolMatched(char c, int start) {
* for a subscribed listener.
* @param str the text string
* @param start the text start position
* @return {@code true} if one or more complete tokens were published from
* the text
*/
protected void onNonSymbolMatched(String str, int start) {
protected boolean onNonSymbolMatched(String str, int start) {
NonSymbolMatchedListener l = nonSymbolListener;
if (l != null) {
TextMatchedEvent evt = new TextMatchedEvent(this, str, start,
start + str.length());
l.nonSymbolMatched(evt);
return evt.isPublished();
}
return false;
}

/**
Expand All @@ -150,15 +205,19 @@ protected void onNonSymbolMatched(String str, int start) {
* @param str the text string
* @param hint the text hint
* @param start the text start position
* @return {@code true} if one or more complete tokens were published from
* the text
*/
protected void onNonSymbolMatched(String str, EmphasisHint hint,
int start) {
protected boolean onNonSymbolMatched(String str, EmphasisHint hint,
int start) {
NonSymbolMatchedListener l = nonSymbolListener;
if (l != null) {
TextMatchedEvent evt = new TextMatchedEvent(this, str, hint, start,
start + str.length());
l.nonSymbolMatched(evt);
return evt.isPublished();
}
return false;
}

/**
Expand Down
Loading