done.

wangyi-yd · wangyi-yd · commit a9cd996a4af3 · 2015-12-11T17:02:46.000+01:00
diff --git a/detector/Fixtures/test1.csv b/detector/Fixtures/test1.csv
@@ -0,0 +1,6 @@
+Year,Make,Model,Description,Price
+1997,Ford,E350,"ac, abs, moon",3000.00
+1999,Chevy,"Venture ""Extended Edition""","",4900.00
+1999,Chevy,"Venture ""Extended Edition, Very Large""",,5000.00
+1996,Jeep,Grand Cherokee,"MUST SELL!
+air, moon roof, loaded",4799.00
diff --git a/detector/detect.go b/detector/detect.go
@@ -0,0 +1,169 @@
+package detector
+
+import (
+	"bufio"
+	"regexp"
+	"math"
+	"io"
+)
+
+const (
+	// sampleLines is the line limit DetectDelimiter passes to sample.
+	sampleLines = 15
+	// nonDelimiterRegexString matches a single ASCII alphanumeric, newline or
+	// carriage-return character. sample never records a matching byte as a
+	// delimiter candidate.
+	nonDelimiterRegexString = `[[:alnum:]\n\r]`
+)
+
+// New returns a Detector backed by the default implementation, with its
+// non-delimiter pattern compiled from nonDelimiterRegexString.
+func New() Detector {
+	impl := new(detector)
+	impl.nonDelimiterRegex = regexp.MustCompile(nonDelimiterRegexString)
+	return impl
+}
+
+// Detector defines the exposed interface.
+type Detector interface {
+	// DetectDelimiter reads from reader and returns the delimiter candidates
+	// it finds, each as a string. In the default implementation, enclosure is
+	// the quoting byte: bytes between a pair of enclosures are not counted.
+	DetectDelimiter(reader io.Reader, enclosure byte) []string
+}
+
+// detector is the default implementation of Detector.
+type detector struct {
+	// nonDelimiterRegex matches the single characters that sample must not
+	// count as delimiter candidates. New compiles it from
+	// nonDelimiterRegexString.
+	nonDelimiterRegex *regexp.Regexp
+}
+
+// DetectDelimiter samples reader with sampleLines as the line limit and
+// returns every byte that analyze accepts as a delimiter candidate, each
+// converted to a string. The result is nil when there is no candidate.
+func (d *detector) DetectDelimiter(reader io.Reader, enclosure byte) []string {
+	table, lineCount := d.sample(reader, sampleLines, enclosure)
+	found := d.analyze(table, lineCount)
+
+	var result []string
+	for i := range found {
+		result = append(result, string(found[i]))
+	}
+	return result
+}
+
+// sample reads reader byte by byte and records, for every line, how often each
+// potential delimiter byte occurs outside of enclosures (the 'frequencyTable').
+// A byte is a potential delimiter when d.nonDelimiterRegex does not match it.
+//
+// A line ends at an unenclosed "\n", "\r" or "\r\n". Inside an enclosure a
+// doubled enclosure byte is an escaped literal and does not close it. Sampling
+// stops at the end of input, at the first read error (which is not reported),
+// or once sampleLines lines have been read in full.
+//
+// actualSampleLines is the number of lines sampled, which may be less than
+// sampleLines. It is at least 1, even for empty input, and it does not count
+// the empty line that follows a line terminator at the very end of the input.
+func (d *detector) sample(reader io.Reader, sampleLines int, enclosure byte) (frequencies frequencyTable, actualSampleLines int) {
+	bufferedReader := bufio.NewReader(reader)
+	frequencies = createFrequencyTable()
+	actualSampleLines = 1
+
+	var prev byte
+	enclosed := false
+	// lineIsEmpty stays true until the first byte of the current line is seen.
+	lineIsEmpty := true
+
+	// Reading single bytes through the bufio.Reader keeps prev and the
+	// one-byte lookahead valid across the reader's internal buffer refills.
+	for {
+		current, err := bufferedReader.ReadByte()
+		if err != nil {
+			break
+		}
+
+		switch {
+		case current == enclosure:
+			lineIsEmpty = false
+			escaped := false
+			if enclosed {
+				next, peekErr := bufferedReader.Peek(1)
+				escaped = peekErr == nil && next[0] == enclosure
+			}
+			if escaped {
+				// Skip the second half of the doubled enclosure. Peek just
+				// proved that one byte is buffered, so Discard cannot fail.
+				_, _ = bufferedReader.Discard(1)
+			} else {
+				enclosed = !enclosed
+			}
+		case enclosed:
+			// Enclosed content, line breaks included, is never counted.
+			lineIsEmpty = false
+		case current == '\n' && prev == '\r':
+			// Second half of a CRLF pair: the '\r' already ended the line.
+		case current == '\n' || current == '\r':
+			// Check the limit before counting, so that every counted line
+			// has actually been sampled.
+			if actualSampleLines >= sampleLines {
+				return frequencies, actualSampleLines
+			}
+			actualSampleLines++
+			lineIsEmpty = true
+		default:
+			lineIsEmpty = false
+			if !d.nonDelimiterRegex.MatchString(string(current)) {
+				frequencies.increment(current, actualSampleLines)
+			}
+		}
+
+		prev = current
+	}
+
+	// A terminator at the very end of the input opens a line that holds
+	// nothing; counting it would give every delimiter a zero-frequency line.
+	if lineIsEmpty && actualSampleLines > 1 {
+		actualSampleLines--
+	}
+
+	return frequencies, actualSampleLines
+}
+
+// analyze is built on this observation: a delimiter must appear the same
+// number of times on every line. It returns each byte of ft whose frequency is
+// identical on all lines 1..sampleLine; a line without an entry counts as
+// frequency zero.
+//
+// Candidates are returned in ascending byte order so that the result does not
+// depend on map iteration order. The result is nil when no byte qualifies or
+// when sampleLine is less than 1.
+func (d *detector) analyze(ft frequencyTable, sampleLine int) []byte {
+	if sampleLine < 1 {
+		return nil
+	}
+
+	// uniform reports whether lines 1..sampleLine all carry the same count.
+	// Comparing the integer counts directly is exact, unlike testing a
+	// floating-point deviation against zero.
+	uniform := func(frequencyOfLine map[int]int) bool {
+		first := frequencyOfLine[1]
+		for line := 2; line <= sampleLine; line++ {
+			if frequencyOfLine[line] != first {
+				return false
+			}
+		}
+		return true
+	}
+
+	var candidates []byte
+	// Walk the byte values in order instead of ranging over the map.
+	for b := 0; b <= math.MaxUint8; b++ {
+		frequencyOfLine, ok := ft[byte(b)]
+		if !ok {
+			continue
+		}
+		if uniform(frequencyOfLine) {
+			candidates = append(candidates, byte(b))
+		}
+	}
+
+	return candidates
+}
+
+// frequencyTable remembers how often each character occurs on each line.
+// The outer key is the character, the inner key is the line number, and the
+// value is the count: frequencyTable['.'][11] is the frequency of '.' on
+// line 11.
+type frequencyTable map[byte]map[int]int
+
+// createFrequencyTable returns an empty frequencyTable that is ready for use.
+func createFrequencyTable() frequencyTable {
+	return frequencyTable{}
+}
+
+// increment adds one to the frequency of ch on the given line, creating the
+// per-line map for ch on first use. It returns f so that calls can be chained.
+func (f frequencyTable) increment(ch byte, line int) frequencyTable {
+	perLine, known := f[ch]
+	if !known {
+		perLine = make(map[int]int)
+		f[ch] = perLine
+	}
+
+	// A missing line reads as zero, so no explicit initialisation is needed.
+	perLine[line]++
+
+	return f
+}
diff --git a/detector/detect_test.go b/detector/detect_test.go
@@ -0,0 +1,124 @@
+package detector
+
+import (
+	"github.com/stretchr/testify/assert"
+	"testing"
+	"os"
+	"regexp"
+)
+
+// TestIsPotentialDelimiter checks that the non-delimiter pattern matches
+// letters and digits, and does not match '|' or '$', which therefore remain
+// potential delimiters.
+func TestIsPotentialDelimiter(t *testing.T) {
+	cases := []struct {
+		input    byte
+		expected bool
+	}{
+		{input: 'a', expected: false},
+		{input: 'A', expected: false},
+		{input: '1', expected: false},
+		{input: '|', expected: true},
+		{input: '$', expected: true},
+	}
+
+	subject := &detector{
+		nonDelimiterRegex: regexp.MustCompile(nonDelimiterRegexString),
+	}
+	for _, tc := range cases {
+		isCandidate := !subject.nonDelimiterRegex.MatchString(string(tc.input))
+		assert.Equal(t, tc.expected, isCandidate)
+	}
+}
+
+// TestFrequencyTable checks that chained increment calls accumulate counts per
+// character and per line.
+func TestFrequencyTable(t *testing.T) {
+	table := createFrequencyTable()
+
+	table.
+		increment(',', 1).
+		increment(',', 2).
+		increment('|', 3).
+		increment('|', 3)
+
+	assert.Equal(t, 1, table[','][1])
+	assert.Equal(t, 1, table[','][2])
+	assert.Equal(t, 2, table['|'][3])
+}
+
+// TestDetectDelimiter runs the exported API against the CSV fixture and
+// expects the comma to be the only detected delimiter.
+func TestDetectDelimiter(t *testing.T) {
+	fixture, err := os.Open("./Fixtures/test1.csv")
+	assert.NoError(t, err)
+	defer fixture.Close()
+
+	subject := New()
+	got := subject.DetectDelimiter(fixture, '"')
+
+	assert.Equal(t, []string{","}, got)
+}
+
+// TestDetectorSample samples the CSV fixture and checks the per-line
+// frequencies recorded for '.', ' ' and ',' as well as the number of sampled
+// lines. Only these three characters are checked; other entries in the
+// returned table are ignored.
+func TestDetectorSample(t *testing.T) {
+	// sample calls d.nonDelimiterRegex.MatchString, so the pattern has to be
+	// compiled here; a zero-value detector would dereference a nil *Regexp.
+	subject := &detector{
+		nonDelimiterRegex: regexp.MustCompile(nonDelimiterRegexString),
+	}
+
+	file, err := os.OpenFile("./Fixtures/test1.csv", os.O_RDONLY, os.ModePerm)
+	if !assert.NoError(t, err) {
+		return
+	}
+	defer file.Close()
+
+	actual, line := subject.sample(file, 15, '"')
+	expected := frequencyTable{
+		'.': map[int]int{
+			2: 1,
+			3: 1,
+			4: 1,
+			5: 1,
+		},
+		' ': map[int]int{
+			5: 1,
+		},
+		',': map[int]int{
+			1: 4,
+			2: 4,
+			3: 4,
+			4: 4,
+			5: 4,
+		},
+	}
+
+	for k, v := range expected {
+		assert.Equal(t, v, actual[k])
+	}
+	assert.Equal(t, 5, line)
+}
+
+// TestDetectAnalyze feeds analyze a hand-built table covering five lines and
+// expects only ',' to be accepted: it is the only character with the same
+// frequency on every line.
+func TestDetectAnalyze(t *testing.T) {
+	dots := map[int]int{2: 1, 3: 1, 4: 1, 5: 1}
+	spaces := map[int]int{5: 1}
+	commas := map[int]int{1: 4, 2: 4, 3: 4, 4: 4, 5: 4}
+
+	table := frequencyTable{
+		'.': dots,
+		' ': spaces,
+		',': commas,
+	}
+
+	subject := &detector{}
+	got := subject.analyze(table, 5)
+
+	assert.Equal(t, []byte{','}, got)
+}