
Commit c0079d1

Merge pull request #4 from adamlerer/rnn
RNN word language model example
2 parents 764ac3b + e66247a commit c0079d1

File tree

5 files changed, +49488 −0 lines changed


word_language_model/data.py

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
########################################
# Data Fetching Script for PTB
########################################

import torch
import os.path


class Dictionary(object):
    # Maps each word to an integer id and keeps the reverse lookup list.
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def addword(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1

        return self.word2idx[word]

    def ntokens(self):
        return len(self.idx2word)


class Corpus(object):
    # Builds a shared dictionary and tokenizes the train/valid/test splits.
    def __init__(self, path):
        self.dic = Dictionary()
        self.train = self._loadfile(os.path.join(path, 'train.txt'))
        self.valid = self._loadfile(os.path.join(path, 'valid.txt'))
        self.test = self._loadfile(os.path.join(path, 'test.txt'))

    # Tokenize a text file.
    def _loadfile(self, path):
        # Read words from the file, add them to the dictionary, and count tokens.
        assert(os.path.exists(path))
        tokens = 0
        with open(path, 'r') as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    self.dic.addword(word)
                    tokens += 1

        # Second pass: convert the file into a flat 1-D LongTensor of word ids.
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dic.word2idx[word]
                    token += 1

        # Final dataset.
        return ids
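
For context, a minimal usage sketch of this file follows. It is not part of the commit: the ./data/penn directory layout and the batchify helper are assumptions, included only to illustrate how the flat LongTensor returned by _loadfile might be reshaped into batches for RNN training.

from data import Corpus   # assumes the file above is saved as data.py

# Hypothetical data directory containing train.txt, valid.txt, test.txt
corpus = Corpus('./data/penn')

print(corpus.dic.ntokens())   # vocabulary size
print(corpus.train.size(0))   # number of training tokens (flat 1-D LongTensor)

def batchify(ids, batch_size):
    # Illustrative helper (an assumption, not defined in this commit):
    # trim the tail so the tensor divides evenly, then reshape so that
    # each column of the result is a contiguous stream of token ids.
    nbatch = ids.size(0) // batch_size
    ids = ids.narrow(0, 0, nbatch * batch_size)
    return ids.view(batch_size, -1).t().contiguous()

train_data = batchify(corpus.train, 20)   # shape: (nbatch, 20)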
