-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathpatterns.go
79 lines (70 loc) · 1.37 KB
/
patterns.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
package textcat
import (
"regexp"
"sort"
"strings"
)
// MaxPatterns is the maximum number of top-ranked n-grams that GetPatterns
// returns.
const MaxPatterns = 400
// reInvalid matches every run of one or more characters that are not Unicode
// letters.
var reInvalid = regexp.MustCompile("[^\\p{L}]+")
// countType pairs an n-gram (S) with the number of times it was counted (I).
type countType struct {
	S string
	I int
}

// countsType is a list of n-gram counts. Its Len, Swap and Less methods make
// it sortable by descending count, with ties broken by ascending n-gram.
type countsType []*countType

// Len reports the number of entries in the list.
func (cs countsType) Len() int { return len(cs) }

// Swap exchanges the entries at positions i and j.
func (cs countsType) Swap(i, j int) {
	tmp := cs[i]
	cs[i] = cs[j]
	cs[j] = tmp
}

// Less reports whether entry i must sort before entry j: the entry with the
// larger count comes first, and equal counts fall back to comparing the
// n-gram strings in ascending order.
func (cs countsType) Less(i, j int) bool {
	a, b := cs[i], cs[j]
	if a.I == b.I {
		return a.S < b.S
	}
	return a.I > b.I
}
// maxNgramLen is the longest n-gram, in units (runes or bytes), that
// GetPatterns extracts.
const maxNgramLen = 5

// GetPatterns builds the ranked n-gram profile of s.
//
// The text is split into whitespace-separated words. Each word is padded to
// "_word____", and for every start position on the leading pad or on a unit
// of the word itself, the n-grams of length 1 to 5 beginning there are
// counted. N-grams ending in "__" are discarded, so a counted n-gram carries
// at most one trailing pad character.
//
// When useRunes is true, s is first normalized: every run of non-letter
// characters is replaced by a single space and the text is lower-cased;
// n-grams are then measured in runes. When useRunes is false, s is used
// as-is and n-grams are measured in bytes, so a multi-byte character may be
// split across n-grams.
//
// The result is sorted by descending count, ties broken by ascending n-gram,
// and truncated to at most MaxPatterns entries.
func GetPatterns(s string, useRunes bool) []*countType {
	const (
		leadPad  = "_"
		trailPad = "____"
	)

	if useRunes {
		s = strings.ToLower(reInvalid.ReplaceAllString(s, " "))
	}

	ngrams := make(map[string]int)
	for _, word := range strings.Fields(s) {
		padded := leadPad + word + trailPad
		if useRunes {
			units := []rune(padded)
			// Start positions cover the leading pad and the word, never the
			// trailing pad, so i+j below stays within units.
			addNgrams(ngrams, len(units)-len(trailPad), func(lo, hi int) string {
				return string(units[lo:hi])
			})
		} else {
			addNgrams(ngrams, len(padded)-len(trailPad), func(lo, hi int) string {
				return padded[lo:hi]
			})
		}
	}

	counts := make([]*countType, 0, len(ngrams))
	for gram, n := range ngrams {
		counts = append(counts, &countType{gram, n})
	}
	sort.Sort(countsType(counts))
	if len(counts) > MaxPatterns {
		counts = counts[:MaxPatterns]
	}
	return counts
}

// addNgrams increments ngrams for every n-gram of length 1..maxNgramLen that
// begins at one of the first starts positions. sub(lo, hi) must return the
// text covering units [lo, hi) of the padded word. N-grams ending in "__"
// are skipped.
func addNgrams(ngrams map[string]int, starts int, sub func(lo, hi int) string) {
	for i := 0; i < starts; i++ {
		for j := 1; j <= maxNgramLen; j++ {
			if gram := sub(i, i+j); !strings.HasSuffix(gram, "__") {
				ngrams[gram]++
			}
		}
	}
}