Skip to content

Commit a9cd996

Browse files
author
wangyi-yd
committed
done.
1 parent 945de7b commit a9cd996

File tree

3 files changed

+299
-0
lines changed

3 files changed

+299
-0
lines changed

detector/Fixtures/test1.csv

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Year,Make,Model,Description,Price
2+
1997,Ford,E350,"ac, abs, moon",3000.00
3+
1999,Chevy,"Venture ""Extended Edition""","",4900.00
4+
1999,Chevy,"Venture ""Extended Edition, Very Large""",,5000.00
5+
1996,Jeep,Grand Cherokee,"MUST SELL!
6+
air, moon roof, loaded",4799.00

detector/detect.go

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
package detector
2+
3+
import (
4+
"bufio"
5+
"regexp"
6+
"math"
7+
"io"
8+
)
9+
10+
// sampleLines is the line limit DetectDelimiter passes to sample.
const sampleLines = 15

// nonDelimiterRegexString is the pattern for characters that are never
// counted as delimiter candidates: ASCII letters and digits, plus the line
// terminators CR and LF.
const nonDelimiterRegexString = `[[:alnum:]\n\r]`
14+
15+
// New a detector.
16+
func New() Detector {
17+
return &detector{
18+
nonDelimiterRegex : regexp.MustCompile(nonDelimiterRegexString),
19+
}
20+
}
21+
22+
// Detector is the public interface of this package.
type Detector interface {
	// DetectDelimiter reads from reader and returns the delimiter
	// candidates it finds, each as a string. enclosure is the quoting
	// byte that wraps field values.
	DetectDelimiter(reader io.Reader, enclosure byte) []string
}
26+
27+
// detector is the package's default Detector implementation.
type detector struct {
	// nonDelimiterRegex matches single characters that must not be counted
	// as delimiter candidates while sampling.
	nonDelimiterRegex *regexp.Regexp
}
31+
32+
// DetectDelimiter finds a slice of delimiter string.
33+
func (d *detector) DetectDelimiter(reader io.Reader, enclosure byte) []string {
34+
statistics, totalLines := d.sample(reader, sampleLines, enclosure)
35+
36+
var candidates []string
37+
for _, delimiter := range d.analyze(statistics, totalLines) {
38+
candidates = append(candidates, string(delimiter))
39+
}
40+
41+
return candidates
42+
}
43+
44+
// sample reads lines and walks through each character, records the frequencies of each candidate delimiter
45+
// at each line(here we call it the 'frequencyTable'). It also returns the actual sampling lines
46+
// because it might be less than sampleLines.
47+
func (d *detector) sample(reader io.Reader, sampleLines int, enclosure byte) (frequencies frequencyTable, actualSampleLines int) {
48+
bufferedReader := bufio.NewReader(reader)
49+
frequencies = createFrequencyTable()
50+
51+
enclosed := false
52+
actualSampleLines = 1
53+
var prev, current, next byte
54+
var err error
55+
56+
bufSize := 1024
57+
buf := make([]byte, bufSize)
58+
n, err := bufferedReader.Read(buf)
59+
60+
for err == nil {
61+
for i := 0; i < n; i++ {
62+
current = buf[i]
63+
64+
if i > 0 {
65+
prev = buf[i - 1]
66+
} else {
67+
prev = byte(0)
68+
}
69+
70+
if i < n - 1 {
71+
next = buf[i + 1]
72+
} else {
73+
next = byte(0)
74+
}
75+
76+
if current == enclosure {
77+
if (!enclosed || next != enclosure) {
78+
if enclosed {
79+
enclosed = false
80+
} else {
81+
enclosed = true
82+
}
83+
} else {
84+
i++
85+
}
86+
} else if (current == '\n' && prev != '\r' || current == '\r') && !enclosed {
87+
actualSampleLines++
88+
if actualSampleLines >= sampleLines {
89+
break;
90+
}
91+
} else if !enclosed {
92+
if !d.nonDelimiterRegex.MatchString(string(current)) {
93+
frequencies.increment(current, actualSampleLines)
94+
}
95+
}
96+
}
97+
98+
n, err = bufferedReader.Read(buf)
99+
}
100+
101+
return
102+
}
103+
104+
// analyze is built based on such an observation: the delimiter must appears
105+
// the same times at each line, usually, it appears more than once. Therefore
106+
// for each delimiter candidate, the deviation of its frequency at each line
107+
// is calculated, if the deviation is 0, it means it appears the same times at
108+
// each sampled line.
109+
func (d *detector) analyze(ft frequencyTable, sampleLine int) []byte {
110+
mean := func(frequencyOfLine map[int]int, size int) float32 {
111+
total := 0
112+
for i := 1; i <= size; i++ {
113+
if frequency, ok := frequencyOfLine[i]; ok {
114+
total += frequency
115+
}
116+
}
117+
return float32(total) / float32(size)
118+
}
119+
120+
deviation := func(frequencyOfLine map[int]int, size int) float64 {
121+
average := mean(frequencyOfLine, size)
122+
var total float64
123+
for i := 1; i <= size; i++ {
124+
var frequency float32
125+
126+
if v, ok := frequencyOfLine[i]; ok {
127+
frequency = float32(v)
128+
}
129+
130+
d := (average - frequency) * (average - frequency)
131+
total += math.Sqrt(float64(d))
132+
}
133+
134+
return total / float64(size)
135+
}
136+
137+
var candidates []byte
138+
for delimiter, frequencyOfLine := range ft {
139+
if float64(0.0) == deviation(frequencyOfLine, sampleLine) {
140+
candidates = append(candidates, delimiter)
141+
}
142+
}
143+
144+
return candidates
145+
}
146+
147+
// frequencyTable stores how often each character occurs on each line:
// frequencyTable['.'][11] is the number of '.' characters on line 11.
type frequencyTable map[byte]map[int]int

// createFrequencyTable returns an empty, ready-to-use frequencyTable.
func createFrequencyTable() frequencyTable {
	return frequencyTable{}
}

// increment adds one occurrence of ch on the given line, creating the
// per-line map for ch on first use. It returns f so calls can be chained.
func (f frequencyTable) increment(ch byte, line int) frequencyTable {
	perLine, known := f[ch]
	if !known {
		perLine = map[int]int{}
		f[ch] = perLine
	}

	// A missing line reads as zero, so no explicit initialization is needed.
	perLine[line]++

	return f
}

detector/detect_test.go

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
package detector
2+
3+
import (
4+
"github.com/stretchr/testify/assert"
5+
"testing"
6+
"os"
7+
"regexp"
8+
)
9+
10+
func TestIsPotentialDelimiter(t *testing.T) {
11+
tests := []struct {
12+
input byte
13+
expected bool
14+
}{
15+
{
16+
byte('a'),
17+
false,
18+
},
19+
{
20+
byte('A'),
21+
false,
22+
},
23+
{
24+
byte('1'),
25+
false,
26+
},
27+
{
28+
byte('|'),
29+
true,
30+
},
31+
{
32+
byte('$'),
33+
true,
34+
},
35+
}
36+
37+
detector := &detector{
38+
nonDelimiterRegex : regexp.MustCompile(nonDelimiterRegexString),
39+
}
40+
for _, test := range tests {
41+
assert.Equal(t, test.expected, !detector.nonDelimiterRegex.MatchString(string(test.input)))
42+
}
43+
}
44+
45+
func TestFrequencyTable(t *testing.T) {
46+
ft := createFrequencyTable()
47+
48+
ft.increment(',', 1).increment(',', 2).increment('|', 3).increment('|', 3)
49+
50+
assert.Equal(t, 1, ft[','][1])
51+
assert.Equal(t, 1, ft[','][2])
52+
assert.Equal(t, 2, ft['|'][3])
53+
}
54+
55+
func TestDetectDelimiter(t *testing.T) {
56+
detector := New()
57+
58+
file, err := os.OpenFile("./Fixtures/test1.csv", os.O_RDONLY, os.ModePerm)
59+
assert.NoError(t, err)
60+
defer file.Close()
61+
62+
delimiters := detector.DetectDelimiter(file, '"')
63+
64+
assert.Equal(t, []string{","}, delimiters)
65+
}
66+
67+
func TestDetectorSample(t *testing.T) {
68+
detector := &detector{}
69+
70+
file, err := os.OpenFile("./Fixtures/test1.csv", os.O_RDONLY, os.ModePerm)
71+
assert.NoError(t, err)
72+
defer file.Close()
73+
74+
actual, line := detector.sample(file, 15, '"')
75+
expected := frequencyTable{
76+
'.' : map[int]int{
77+
2: 1,
78+
3: 1,
79+
4: 1,
80+
5: 1,
81+
},
82+
' ' : map[int]int{
83+
5: 1,
84+
},
85+
',' : map[int]int{
86+
1: 4,
87+
2: 4,
88+
3: 4,
89+
4: 4,
90+
5: 4,
91+
},
92+
}
93+
94+
for k, v := range expected {
95+
assert.Equal(t, v, actual[k])
96+
}
97+
assert.Equal(t, 5, line)
98+
}
99+
100+
func TestDetectAnalyze(t *testing.T) {
101+
ft := frequencyTable{
102+
'.' : map[int]int{
103+
2: 1,
104+
3: 1,
105+
4: 1,
106+
5: 1,
107+
},
108+
' ' : map[int]int{
109+
5: 1,
110+
},
111+
',' : map[int]int{
112+
1: 4,
113+
2: 4,
114+
3: 4,
115+
4: 4,
116+
5: 4,
117+
},
118+
}
119+
120+
detector := &detector{}
121+
candidates := detector.analyze(ft, 5)
122+
123+
assert.Equal(t, []byte{','}, candidates)
124+
}

0 commit comments

Comments
 (0)