summarize: Small variable rename and handle engine not found case #131

Merged
merged 4 commits on Oct 20, 2018
pythainlp/summarize/__init__.py (99 changes: 58 additions & 41 deletions)
@@ -1,51 +1,68 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,unicode_literals
-from pythainlp.corpus import stopwords
-from string import punctuation
+from __future__ import absolute_import, unicode_literals
+
 from collections import defaultdict
-from pythainlp.tokenize import sent_tokenize, word_tokenize
 from heapq import nlargest
+from string import punctuation
+
+from pythainlp.corpus import stopwords
+from pythainlp.tokenize import sent_tokenize, word_tokenize
+
 
 class FrequencySummarizer:
     def __init__(self, min_cut=0.1, max_cut=0.9):
-        self._min_cut = min_cut
-        self._max_cut = max_cut
-        self._stopwords = set(stopwords.words('thai') + list(punctuation))
-
-    def _compute_frequencies(self, word_sent):
-        freq = defaultdict(int)
-        for s in word_sent:
-            for word in s:
-                if word not in self._stopwords:
-                    freq[word] += 1
-        m = float(max(freq.values()))
-        for w in list(freq):
-            freq[w] = freq[w]/m
-            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
-                del freq[w]
-        return freq
-
-    def _rank(self, ranking, n):
+        self.__min_cut = min_cut
+        self.__max_cut = max_cut
+        self.__stopwords = set(stopwords.words("thai") + list(punctuation))
+
+    def __compute_frequencies(self, word_tokenized_sents):
+        word_freqs = defaultdict(int)
+        for sent in word_tokenized_sents:
+            for word in sent:
+                if word not in self.__stopwords:
+                    word_freqs[word] += 1
+
+        max_freq = float(max(word_freqs.values()))
+        for w in list(word_freqs):
+            word_freqs[w] = word_freqs[w] / max_freq
+            if word_freqs[w] >= self.__max_cut or word_freqs[w] <= self.__min_cut:
+                del word_freqs[w]
+
+        return word_freqs
+
+    def __rank(self, ranking, n):
         return nlargest(n, ranking, key=ranking.get)
 
-    def summarize(self, text, n,tokenize):
+    def summarize(self, text, n, tokenizer):
         sents = sent_tokenize(text)
-        word_sent = [word_tokenize(s,tokenize) for s in sents]
-        self._freq = self._compute_frequencies(word_sent)
+        word_tokenized_sents = [word_tokenize(sent, tokenizer) for sent in sents]
+        self.__freq = self.__compute_frequencies(word_tokenized_sents)
         ranking = defaultdict(int)
-        for i, sent in enumerate(word_sent):
+
+        for i, sent in enumerate(word_tokenized_sents):
             for w in sent:
-                if w in self._freq:
-                    ranking[i] += self._freq[w]
-        sents_idx = self._rank(ranking,n)
-        return [sents[j] for j in sents_idx]
-def summarize_text(text,n,engine='frequency',tokenize='newmm'):
-    '''
-    Thai text summarize.
-    :param str text: thai text
-    :param int n: sent number
-    :param str engine: Thai text summarize engine.
-    :param str tokenize: thai word tokenize.
-    '''
-    if engine=='frequency':
-        data=FrequencySummarizer().summarize(text,n,tokenize)
-    return data
+                if w in self.__freq:
+                    ranking[i] += self.__freq[w]
+        summaries_idx = self.__rank(ranking, n)
+
+        return [sents[j] for j in summaries_idx]
+
+
+def summarize_text(text, n, engine="frequency", tokenizer="newmm"):
+    """
+    Thai text summarization
+    :param str text: text to be summarized
+    :param int n: number of sentences to be included in the summary
+    :param str engine: text summarization engine
+    :param str tokenizer: word tokenizer
+    :return List[str] summary: list of selected sentences
+    """
+    sents = []
+
+    if engine == "frequency":
+        sents = FrequencySummarizer().summarize(text, n, tokenizer)
+    else:  # if engine not found, return first n sentences
+        sents = sent_tokenize(text)[:n]
+
+    return sents
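
For context, here is a minimal usage sketch of the updated API (not part of the PR itself). The Thai sample text is made up for illustration, and the calls assume a PyThaiNLP build that includes this change together with its bundled Thai stopword corpus and the "newmm" tokenizer.

```python
# -*- coding: utf-8 -*-
from pythainlp.summarize import summarize_text

# Illustrative Thai text; any text with several sentences works.
text = (
    "อากาศวันนี้ดีมาก ท้องฟ้าแจ่มใส "
    "ผู้คนออกมาวิ่งออกกำลังกายในสวนสาธารณะ "
    "บางคนนั่งอ่านหนังสือใต้ต้นไม้"
)

# Default engine ("frequency") scores each sentence by the summed,
# normalized frequencies of its surviving non-stopword tokens and
# returns the n best sentences, tokenized with the default "newmm"
# word tokenizer.
print(summarize_text(text, n=1))

# With an unrecognized engine the new code falls back to the first n
# sentences; the old code would raise an unassigned-variable error,
# since "data" was only set inside the "frequency" branch.
print(summarize_text(text, n=1, engine="not-a-real-engine"))
```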