stem.go 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184
  1. package main
  2. import (
  3. "bufio"
  4. "bytes"
  5. "github.com/fjl/go-couchdb"
  6. "github.com/kljensen/snowball"
  7. "github.com/kpmy/tier"
  8. "github.com/kpmy/ypk/halt"
  9. "log"
  10. "reflect"
  11. "strings"
  12. "sync"
  13. "time"
  14. "unicode"
  15. )
  16. var defOpts tier.Opts = tier.DefaultOpts
  17. var wordsDb *couchdb.DB
  18. var words *Words
  19. type Words struct {
  20. sync.Mutex
  21. last int64
  22. wm map[string]map[string]int
  23. }
  24. func (w *Words) New() *Words {
  25. w = new(Words)
  26. w.wm = make(map[string]map[string]int)
  27. return w
  28. }
  29. func (w *Words) Load() (err error) {
  30. w.Lock()
  31. const docId = "map"
  32. tmp := make(map[string]interface{})
  33. if err = wordsDb.Get(docId, &tmp, nil); err == nil {
  34. for k, _v := range tmp {
  35. switch v := _v.(type) {
  36. case map[string]interface{}:
  37. mm := make(map[string]int)
  38. for k, _v := range v {
  39. switch v := _v.(type) {
  40. case float64:
  41. mm[k] = int(v)
  42. default:
  43. halt.As(100, k, " ", reflect.TypeOf(v))
  44. }
  45. }
  46. w.wm[k] = mm
  47. case string: //do nothing
  48. default:
  49. halt.As(100, k, " ", reflect.TypeOf(v))
  50. }
  51. }
  52. } else if couchdb.NotFound(err) {
  53. if _, err = wordsDb.Put(docId, tmp, ""); err == nil {
  54. err = w.Load()
  55. }
  56. } else {
  57. log.Println(err)
  58. }
  59. w.Unlock()
  60. return
  61. }
  62. func (w *Words) Store() {
  63. w.Lock()
  64. const docId = "map"
  65. if rev, err := wordsDb.Rev(docId); err == nil {
  66. if _, err = wordsDb.Put(docId, w.wm, rev); err != nil {
  67. log.Println(err)
  68. }
  69. }
  70. w.Unlock()
  71. }
  72. func (w *Words) Sync() {
  73. go func() {
  74. last := time.Now().Unix()
  75. for {
  76. if last != w.last {
  77. w.Store()
  78. last = w.last
  79. }
  80. <-time.After(time.Second)
  81. }
  82. }()
  83. }
  84. const letterSym = ""
  85. func init() {
  86. defOpts.IdentContains = func() string {
  87. return letterSym
  88. }
  89. defOpts.IdentStarts = func() string {
  90. return letterSym
  91. }
  92. defOpts.Skip = func(r rune) bool {
  93. return true
  94. }
  95. defOpts.NoStrings = true
  96. defOpts.NoNum = true
  97. words = words.New()
  98. words.Load()
  99. words.Sync()
  100. }
  101. type WordItem struct {
  102. word string
  103. lang string
  104. }
  105. func detectLang(s string) (lang string) {
  106. lm := make(map[string]string)
  107. for _, r := range strings.ToLower(s) {
  108. if !strings.ContainsRune(letterSym, r) {
  109. switch {
  110. case unicode.IsNumber(r):
  111. lm["mixed"] = "mixed"
  112. case unicode.In(r, unicode.Cyrillic):
  113. lm["russian"] = "russian"
  114. case unicode.In(r, unicode.Latin):
  115. lm["english"] = "english"
  116. }
  117. }
  118. }
  119. if len(lm) == 1 {
  120. for _, l := range lm {
  121. lang = l
  122. }
  123. } else {
  124. lang = ""
  125. }
  126. return
  127. }
  128. func split(s string) (ret chan WordItem) {
  129. ret = make(chan WordItem, 0)
  130. go func() {
  131. sc := tier.NewScanner(bufio.NewReader(bytes.NewBufferString(s)), defOpts)
  132. for sc.Error() == nil {
  133. sym := sc.Get()
  134. if sym.Code == tier.Ident {
  135. w := WordItem{word: strings.ToLower(sym.Value)}
  136. w.lang = detectLang(w.word)
  137. if w.lang != "" {
  138. ret <- w
  139. }
  140. }
  141. }
  142. close(ret)
  143. }()
  144. return
  145. }
  146. func Stem(msg string) {
  147. defer func() {
  148. recover()
  149. }()
  150. for w := range split(msg) {
  151. if stem, err := snowball.Stem(w.word, w.lang, true); err == nil {
  152. var m map[string]int
  153. ok := false
  154. if m, ok = words.wm[stem]; !ok {
  155. m = make(map[string]int)
  156. words.wm[stem] = m
  157. }
  158. if x, ok := m[w.word]; ok {
  159. m[w.word] = x + 1
  160. } else {
  161. m[w.word] = 1
  162. }
  163. }
  164. }
  165. words.last = time.Now().Unix()
  166. //log.Println(words.wm)
  167. }