C# 中文分词[基于统计的朴素贝叶斯算法]

主要思想:

1. 要有一个语料库

2. 统计每个词出现的频率, 一会来做朴素贝叶斯候选

3. 举例: 中国人民共和国的

    其中语料库中有中国, 人民, 中国人, 共和国等等的词组.

现在输入: 中国人都爱中华人民共和国;

分词的时候取max( 各种分法得到的score );

例如: solution1：中国人_都爱中华人民_共和国

solution2：中国_人_都爱中华人民_共和国

solution3：中国_人_都爱_中华_人民_共和国

bestSegSolution = max( solutions(segSolution[i] ));

4.对于一句汉字的分词可以看做

seg( StringIn ) = firPart + seg(StringIn – firPart); // 我用score来衡量当前分词结果的好坏

5. 朴素贝叶斯的意思就是: 分词后的两个词之间是相互独立的, 也就是后者的出现与前者无关

6. 这个只是初级版, 很简单, 需要再加点东西, 结果会更加的完美.. 当然, 按照做事情的原则, 都是从简单开始做的, 再努力

using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.Windows.Forms;
using System.IO;
using System.Diagnostics;

namespace ChineseWordSeg
{
    class NaiveBayes
    {
        private string wordLibPath = "../WordLib/pku_training.txt";// The training corpus used is the PKU corpus.

        // NOTE(review): not read or written in the visible code; presumably set once training has run — confirm.
        bool trained = false;
        // Keyed by string with a long value; trainDate clears it and fills it from "key value" lines of a cached file.
        private Dictionary<string, long> wordLib = new Dictionary<string, long>();
        // NOTE(review): unused in the visible code; presumably per-character counts — confirm.
        private Dictionary<string, long> singleWordLib = new Dictionary<string, long>();
        // NOTE(review): presumably the longest entry length seen in the library — confirm against usage.
        int maxLen = 0;
        // NOTE(review): presumably the best segmentation score found so far — confirm against usage.
        long maxScore = 0;
        private string segPos = ""; // Records the split points of a single sentence; sentences are split at punctuation and other non-Chinese characters
        private string segSentence = ""; // Records the same for the whole paragraph

        // 是不是中文字符

        /// <summary>
        /// Tells whether <paramref name="chr"/> falls inside the code-unit range
        /// U+4E00..U+9FFF (both ends inclusive).
        /// </summary>
        /// <param name="chr">The character to classify.</param>
        /// <returns><c>true</c> when the character is inside that range; otherwise <c>false</c>.</returns>
        bool isChineseWord(char chr)
        {
            const int rangeStart = 0x4E00;
            const int rangeEnd = 0x9FFF;

            // Inside the range exactly when it is neither below the start nor above the end.
            return !(chr < rangeStart || chr > rangeEnd);
        }

        public void trainDate( string path ) {
 // 统计每个词出现的次数

//1. 统计每个词组频率, naiveBayes消歧. 将一个组合不同的方式取得较大概率的那个分组方式.
// 难道每个词还是hash一下么?
//2. 统计每个字的频率, 就像向心力那样... 看看到底哪两个字比较容易联系到一起这个是一句废话,因为我没这么去做
            wordLib.Clear();

            DirectoryInfo dirInfo = new DirectoryInfo(path);
            DirectoryInfo tmpDir = dirInfo.Parent;
            string savePath = tmpDir.FullName;
            FileInfo fInfo = new FileInfo(wordLibPath);
            string fileNamePre = fInfo.Name;
            savePath += "\" + fileNamePre + "_trained";
            FileInfo infoOfDB = new FileInfo(savePath);

            if( File.Exists(savePath) && infoOfDB.Length > 0 ){

                StreamReader sr1 =
                                new StreamReader(@savePath);
                char[] sep = { };

                while (sr1.Peek()!=-1)
                {
                    string[] keyValue = sr1.ReadLine().Split(sep);

                    wordLib[keyValue[0]] = Convert.ToInt32(keyValue[1]);

                }

                    return;
            }

补充：软件开发 , C# ,