Skip to content

Add Czech stemmer #22

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ This crate implements some stemmer algorithms found in the [snowball project](ht

- Arabic
- Armenian
- Czech
- Danish
- Dutch
- English
Expand Down
255 changes: 255 additions & 0 deletions algorithms/czech.sbl
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
/*
* Czech language stemmer
*
* Source obtained from https://snowballstem.org/algorithms/czech/stemmer.html,
* created by Ljiljana Dolamic & Jacques Savoy in 2009 (thank you :)
*/

routines (
RV R1
palatalise
mark_regions
do_possessive
do_case
do_comparative
do_diminutive
do_augmentative
do_derivational
do_deriv_single
do_aggressive
)

externals ( stem )

integers ( pV p1 )

groupings ( v )

stringescapes {}

stringdef a' '{U+00E1}'
stringdef c^ '{U+010D}'
stringdef d^ '{U+010F}'
stringdef e' '{U+00E9}'
stringdef e^ '{U+011B}'
stringdef i' '{U+00ED}'
stringdef n^ '{U+0148}'
stringdef o' '{U+00F3}'
stringdef r^ '{U+0159}'
stringdef s^ '{U+0161}'
stringdef t^ '{U+0165}'
stringdef u' '{U+00FA}'
stringdef u* '{U+016F}'
stringdef y' '{U+00FD}'
stringdef z^ '{U+017E}'

define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}'

define mark_regions as (

$pV = limit
$p1 = limit

do (
gopast non-v setmark pV
gopast non-v gopast v setmark p1
)
)

backwardmode (

define RV as $pV <= cursor
define R1 as $p1 <= cursor

define palatalise as (
[substring] RV among (
'ci' 'ce' '{c^}i' '{c^}'
(<- 'k')
'zi' 'ze' '{z^}i' '{z^}e'
(<- 'h')
'{c^}t{e^}' '{c^}ti' '{c^}t{e'}'
(<- 'ck')
'{s^}t{e^}' '{s^}ti' '{s^}t{e'}'
(<- 'sk')
)
)

define do_possessive as (
[substring] RV among (
'ov' '{u*}v'
(delete)
'in'
(
delete
try palatalise
)
)
)

define do_case as (
[substring] among (
'atech'
'{e^}tem' 'at{u*}m'
'{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi'
'ata' 'aty' 'ama' 'ami' 'ovi'
'at' '{a'}m' 'os' 'us' '{y'}m' 'mi' 'ou'
'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}'
(delete)
'ech' 'ich' '{i'}ch'
'{e'}ho' '{e^}mi' '{e'}mu' '{e^}te' '{e^}ti' '{i'}ho' '{i'}mi'
'emi' 'iho' 'imu'
'{e'}m' '{i'}m' 'es'
'e' 'i' '{i'}' '{e^}'
(
delete
try palatalise
)
'em'
(
<- 'e'
try palatalise
)
)
)

define do_derivational as (
[substring] R1 among (
'obinec'
'ovisk' 'ovstv' 'ovi{s^}t' 'ovn{i'}k'
'{a'}sek' 'loun' 'nost' 'teln' 'ovec' 'ov{i'}k' 'ovtv' 'ovin' '{s^}tin'
'{a'}rn' 'och' 'ost' 'ovn' 'oun' 'out' 'ou{s^}' 'u{s^}k'
'kyn' '{c^}an' 'k{a'}{r^}' 'n{e'}{r^}' 'n{i'}k' 'ctv' 'stv'
'{a'}{c^}' 'a{c^}' '{a'}n' 'an' '{a'}{r^}' 'as'
'ob' 'ot' 'ov' 'o{n^}' 'ul' 'yn'
'{c^}k' '{c^}n' 'dl' 'nk' 'tv' 'tk' 'vk'
(delete)
'ion{a'}{r^}'
'inec' 'itel'
'i{a'}n' 'ist' 'isk' 'i{s^}k' 'itb'
'ic' 'in' 'it' 'iv'
(
<- 'i'
palatalise
)
'enic' 'ec' 'en'
(
<- 'e'
palatalise
)
'{e'}{r^}'
(
<- '{e'}'
palatalise
)
'{e^}n'
(
<- '{e^}'
palatalise
)
'{i'}rn'
'{i'}{r^}' '{i'}n'
(
<- '{i'}'
palatalise
)
)
)
define do_deriv_single as (
[substring] among (
'c' '{c^}' 'k' 'l' 'n' 't'
(delete)
)
)


define do_augmentative as (
[substring] among (
'ajzn' '{a'}k'
(delete)
'izn' 'isk'
(
<- 'i'
palatalise
)
)
)

define do_diminutive as (
[substring] among (
'ou{s^}ek' '{a'}{c^}ek' 'a{c^}ek' 'o{c^}ek' 'u{c^}ek'
'anek' 'onek' 'unek' '{a'}nek'
'e{c^}k' '{e'}{c^}k' 'i{c^}k' '{i'}{c^}k' 'enk' '{e'}nk' 'ink' '{i'}nk'
'{a'}{c^}k' 'a{c^}k' 'o{c^}k' 'u{c^}k' 'ank' 'onk' 'unk'
'{a'}tk' '{a'}nk' 'u{s^}k'
'k'
(delete)
'e{c^}ek' 'enek' 'ek'
(
<- 'e'
palatalise
)
'{e'}{c^}ek' '{e'}k'
(
<- '{e'}'
palatalise
)
'i{c^}ek' 'inek' 'ik'
(
<- 'i'
palatalise
)
'{i'}{c^}ek' '{i'}k'
(
<- '{i'}'
palatalise
)
'{a'}k'
(<- '{a'}')
'ak'
(<- 'a')
'ok'
(<- 'o')
'uk'
(<- 'u')
)
)

define do_comparative as (
[substring] among (
'{e^}j{s^}'
(
<- '{e^}'
palatalise
)
'ej{s^}'
(
<- 'e'
palatalise
)
)
)

define do_aggressive as (
do do_comparative
do do_diminutive
do do_augmentative
do_derivational or do_deriv_single
)
)

define stem as (
do mark_regions
backwards (
do_case
do_possessive
// light and aggressive are the same to this point
// comment next line for light stemmer
do_aggressive
)
)

// Ljiljana Dolamic and Jacques Savoy. 2009.
// Indexing and stemming approaches for the Czech language.
// Inf. Process. Manage. 45, 6 (November 2009), 714-720.
// http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt
// http://members.unine.ch/jacques.savoy/clef/CzechStemmerAgressive.txt
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ use snowball::algorithms;
pub enum Algorithm {
Arabic,
Armenian,
Czech,
Danish,
Dutch,
English,
Expand Down Expand Up @@ -67,6 +68,7 @@ impl Stemmer {
match lang {
Algorithm::Arabic => Stemmer { stemmer: algorithms::arabic::stem },
Algorithm::Armenian => Stemmer { stemmer: algorithms::armenian::stem },
Algorithm::Czech => Stemmer { stemmer: algorithms::czech::stem },
Algorithm::Danish => Stemmer { stemmer: algorithms::danish::stem },
Algorithm::Dutch => Stemmer { stemmer: algorithms::dutch::stem },
Algorithm::English => Stemmer { stemmer: algorithms::english::stem },
Expand Down
Loading