diff --git a/.gitignore b/.gitignore index 66fd13c..ab584cd 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,9 @@ *.so *.dylib +main +test + # Test binary, built with `go test -c` *.test diff --git a/cmd/wc-go/main b/cmd/wc-go/main deleted file mode 100755 index 28e7b1a..0000000 Binary files a/cmd/wc-go/main and /dev/null differ diff --git a/pkg/wc/complex-counter.go b/pkg/wc/complex-counter.go index ecc431f..cc7699e 100644 --- a/pkg/wc/complex-counter.go +++ b/pkg/wc/complex-counter.go @@ -12,38 +12,48 @@ type ComplexCount struct { MaxLineLength int64 } -func GetComplexCount(chunk []byte) ComplexCount { +type ComplexChunk struct { + PrevRune rune + Chunk []byte +} + +func GetComplexCount(chunk ComplexChunk) ComplexCount { var count = ComplexCount{} - word := false var lineLength int64 - runes := bytes.Runes(chunk) + runes := bytes.Runes(chunk.Chunk) + prevRuneIsSpace := unicode.IsSpace(chunk.PrevRune) + var linepos int64 for _, b := range runes { count.CharCount++ - if b == '\n' { - if lineLength > count.MaxLineLength { - count.MaxLineLength = lineLength + if b == '\n' || b == '\r' || b == '\f' { + if linepos > lineLength { + lineLength = linepos } - lineLength = 0 - count.LineCount++ - if word { - word = false - count.WordCount++ + linepos = 0 + if b == '\n' { + count.LineCount++ } - } else if unicode.IsSpace(b) { - lineLength++ - if word { - word = false - count.WordCount++ + } + if unicode.IsSpace(b) { + if b == '\t' { + linepos += 8 - (linepos % 8) + } else if b != '\n' && b != '\r' && b != '\f' && b != '\v' { + linepos++ } + prevRuneIsSpace = true } else { - lineLength++ - word = true + linepos++ + if prevRuneIsSpace { + count.WordCount++ + } + prevRuneIsSpace = false } } + count.MaxLineLength = lineLength return count } -func ConcurrentComplexChunkCounter(chunks <-chan []byte, counts chan<- ComplexCount) { +func ConcurrentComplexChunkCounter(chunks <-chan ComplexChunk, counts chan<- ComplexCount) { var totalCount ComplexCount for chunk := range chunks { count := GetComplexCount(chunk) diff --git a/pkg/wc/counter.go b/pkg/wc/counter.go index 7a8125c..129804d 100644 --- a/pkg/wc/counter.go +++ b/pkg/wc/counter.go @@ -4,10 +4,10 @@ import ( "fmt" "io" "os" - "runtime" + "unicode/utf8" ) -const BufferSize = 1024 * 1024 +const BufferSize = 1024 * 1024 * 4 type Counter struct { Words int64 @@ -35,7 +35,7 @@ func Count(filename string, cw, cc, cl, cb, mll bool) { processLine := cw || cc var c = &Counter{} - numWorkers := runtime.NumCPU() + numWorkers := 2 //runtime.NumCPU() if cl && !processLine { c.Lines = CountLines(file, numWorkers) @@ -60,7 +60,7 @@ func Count(filename string, cw, cc, cl, cb, mll bool) { fmt.Printf("%d ", c.Words) } if cc { - fmt.Printf("%d ", c.Words) + fmt.Printf("%d ", c.Chars) } if cb { fmt.Printf("%d ", c.Bytes) @@ -104,17 +104,18 @@ func CountLines(file *os.File, numWorkers int) int64 { func CountComplex(file *os.File, numWorkers int) *Counter { counter := Counter{} - chunks := make(chan []byte, numWorkers) + chunks := make(chan ComplexChunk, numWorkers) counts := make(chan ComplexCount, numWorkers) for i := 0; i < numWorkers; i++ { go ConcurrentComplexChunkCounter(chunks, counts) } - + var lastRune rune = ' ' // Fake the first char being a space so that the first word is counted for { buf := make([]byte, BufferSize) count, err := file.Read(buf) - chunks <- buf[:count] + chunks <- ComplexChunk{lastRune, buf[:count]} + lastRune, _ = utf8.DecodeLastRune(buf[:count]) if err != nil { if err == io.EOF { break