-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.go
114 lines (97 loc) · 2.1 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
package main
import (
	"bufio"
	"fmt"
	"io"
	"os"
	"runtime"
	"sort"
	"strings"
	"unicode"

	"github.com/dustin/go-wikiparse"
	"github.com/escholtz/segment"
	"golang.org/x/text/transform"
	"golang.org/x/text/unicode/norm"
)
const indexName = "enwiki-20180701-pages-articles-multistream-index.txt.bz2"
const dataName = "enwiki-20180701-pages-articles-multistream.xml.bz2"
// isMn reports whether r belongs to Unicode category Mn (nonspacing marks),
// the category that combining accents fall into.
func isMn(r rune) bool {
	return unicode.In(r, unicode.Mn)
}
// removeAccents returns s with its nonspacing marks (accents) removed.
// The text is first decomposed to NFD so that accents become separate runes
// that isMn can match, those runes are dropped, and the remainder is
// recomposed to NFC. If the transformation reports an error, s is returned
// unchanged.
// http://blog.golang.org/normalization#TOC_10.
func removeAccents(s string) string {
	// Built on every call instead of being shared in a package-level
	// variable: the chained transformer is not believed to be thread safe.
	stripMarks := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
	stripped, _, err := transform.String(stripMarks, s)
	if err != nil {
		return s
	}
	return stripped
}
// normalize returns the canonical form of s used for counting: the string is
// lower-cased first and then passed through removeAccents.
func normalize(s string) string {
	return removeAccents(strings.ToLower(s))
}
// Pair couples a token (Key) with the number of times it was counted (Value).
// It exists so that the entries of the token-count map can be put in a slice
// and sorted.
type Pair struct {
// Key is the token text.
Key string
// Value is the occurrence count recorded for Key.
Value int
}
// main counts how often each normalized word token occurs in the titles of
// Wikipedia articles and prints the tokens, most frequent first.
//
// Pages are read through an indexed parser built from indexName and dataName.
// Pages outside namespace 0 and redirect pages are skipped. For every
// remaining page the title is split into word segments, each segment is
// normalized, and its count is incremented.
//
// Output goes to stdout, one line per token: the token, its count, and the
// count as a percentage of all counted tokens, separated by tabs. Lines are
// ordered by descending count, ties broken by ascending token.
//
// Errors are written to stderr and make the process exit with status 1. If
// the parser fails part-way through, the counts gathered up to that point are
// still printed before exiting.
func main() {
	p, err := wikiparse.NewIndexedParser(
		indexName,
		dataName,
		runtime.GOMAXPROCS(0))
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error setting up parser: %v\n", err)
		os.Exit(1)
	}

	tokenCount := map[string]int{}
	parseFailed := false
	for {
		page, err := p.Next()
		if err == io.EOF {
			break
		}
		if err != nil {
			// Stop reading, but keep going so the partial counts are
			// still reported; the failure is reflected in the exit status.
			fmt.Fprintf(os.Stderr, "%v\n", err)
			parseFailed = true
			break
		}
		// Skip pages that aren't articles
		if page.Ns != 0 {
			continue
		}
		// Skip redirects
		if len(page.Redir.Title) > 0 {
			continue
		}

		seg := segment.NewWordSegmenter(strings.NewReader(page.Title))
		for seg.Segment() {
			// Segments of type None are skipped, not counted.
			if seg.Type() == segment.None {
				continue
			}
			tokenCount[normalize(seg.Text())]++
		}
	}

	pairs := make([]Pair, 0, len(tokenCount))
	total := 0
	for k, v := range tokenCount {
		pairs = append(pairs, Pair{Key: k, Value: v})
		total += v
	}

	// Highest count first, ties broken by key. Keys come from a map and are
	// therefore unique, so this is a total order and the result does not
	// depend on sort stability.
	sort.Slice(pairs, func(i, j int) bool {
		if pairs[i].Value != pairs[j].Value {
			return pairs[i].Value > pairs[j].Value
		}
		return pairs[i].Key < pairs[j].Key
	})

	// Buffer stdout: there is one line per distinct token, and an unbuffered
	// Printf would issue a write for each of them.
	out := bufio.NewWriter(os.Stdout)
	for _, pair := range pairs {
		// total is non-zero here because pairs is non-empty.
		pct := 100.0 * (float64(pair.Value) / float64(total))
		fmt.Fprintf(out, "%s\t%d\t%.6f\n", pair.Key, pair.Value, pct)
	}
	// A failed write makes the Writer sticky-fail; Flush reports it.
	if err := out.Flush(); err != nil {
		fmt.Fprintf(os.Stderr, "Error writing output: %v\n", err)
		os.Exit(1)
	}
	if parseFailed {
		os.Exit(1)
	}
}