<?php
/**
 * Chinese word segmentation helper.
 *
 * The class works on GBK bytes internally: every Chinese character is assumed
 * to occupy exactly two bytes, and all length constants below are byte counts
 * or byte indexes. When the CHARSET constant is 'utf-8', input is converted to
 * GBK on the way in and results are converted back to UTF-8 on the way out.
 *
 * @author wangcanjia
 */
class segment {
	/** Dictionary: rank_dic[byte length][word] => rank value read from dict.csv. */
	public $rank_dic = array();
	/** Set of single-character surnames (GBK bytes => 1). */
	public $one_name_dic = array();
	/** Set of two-character surnames (GBK bytes => 1). */
	public $two_name_dic = array();
	/** Not used by any method of this class. */
	public $new_word = array();
	/** Text currently being segmented (GBK bytes). */
	public $source_string = '';
	/** Segmentation result accumulated by split_result() (GBK bytes). */
	public $result_string = '';
	/** Separator placed between words. */
	public $split_char = ' ';
	/** Retained word length; not used by any method of this class. */
	public $SplitLen = 4;
	/** Alternation of characters that must not be glued onto a surname. */
	public $especial_char = "和|的|是";
	/** Not used by any method of this class. */
	public $new_word_limit = "在|的|与|或|就|你|我|他|她|有|了|是|其|能|对|地";
	/** Alternation of unit characters that may directly follow a numeral. */
	public $common_unit = "年|月|日|时|分|秒|点|元|百|千|万|亿|位|辆";
	/**
	 * Kept only so that code reading this public property keeps working.
	 * It is no longer used as a regular expression: "|+|" and the bare "."
	 * make it an invalid/over-matching PCRE pattern. Full-width characters
	 * are recognised by fullwidth_to_ascii() instead.
	 */
	public $cn_number = "0|1|2|3|4|5|6|7|8|9|+|-|%|.|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s |t|u|v|w|x|y|z|A|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z";
	/** Alternation of Chinese numeral characters. */
	public $cn_sg_num = "一|二|三|四|五|六|七|八|九|十|百|千|万|亿|数";
	/** Longest dictionary word is 7 Chinese characters; value is the highest byte index. */
	public $max_len = 13;
	/** Shortest word is 2 Chinese characters; value is the highest byte index. */
	public $min_len = 3;
	/** Space separated list of two-character surnames. */
	public $cn_two_name = "端木 南宫 谯笪 轩辕 令狐 钟离 闾丘 长孙 鲜于 宇文 司徒 司空 上官 欧阳 公孙 西门 东门 左丘 东郭 呼延 慕容 司马 夏侯 诸葛 东方 赫连 皇甫 尉迟 申屠";
	/** Concatenated list of single-character surnames. */
	public $cn_one_name = "赵钱孙李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许何吕施张孔曹严华金魏陶姜戚谢邹喻柏水窦章云苏潘葛奚范彭郎鲁韦昌马苗凤花方俞任袁柳酆鲍史唐费廉岑薛雷贺倪汤滕殷罗毕郝邬安常乐于时傅皮卡齐康伍余元卜顾孟平黄穆萧尹姚邵堪汪祁毛禹狄米贝明臧计伏成戴谈宋茅庞熊纪舒屈项祝董粱杜阮蓝闵席季麻强贾路娄危江童颜郭梅盛林刁钟徐邱骆高夏蔡田樊胡凌霍虞万支柯咎管卢莫经房裘缪干解应宗宣丁贲邓郁单杭洪包诸左石崔吉钮龚程嵇邢滑裴陆荣翁荀羊於惠甄魏加封芮羿储靳汲邴糜松井段富巫乌焦巴弓牧隗谷车侯宓蓬全郗班仰秋仲伊宫宁仇栾暴甘钭厉戎祖武符刘姜詹束龙叶幸司韶郜黎蓟薄印宿白怀蒲台从鄂索咸籍赖卓蔺屠蒙池乔阴郁胥能苍双闻莘党翟谭贡劳逄姬申扶堵冉宰郦雍郤璩桑桂濮牛寿通边扈燕冀郏浦尚农温别庄晏柴翟阎充慕连茹习宦艾鱼容向古易慎戈廖庚终暨居衡步都耿满弘匡国文寇广禄阙东殴殳沃利蔚越夔隆师巩厍聂晁勾敖融冷訾辛阚那简饶空曾沙须丰巢关蒯相查后江游竺";

	/**
	 * Normalises the literal tables to GBK and, optionally, builds the
	 * surname sets and loads the rank dictionary.
	 *
	 * @param bool $loaddic When false, no surname set is built and no
	 *                      dictionary file is read.
	 */
	public function __construct($loaddic = true) {
		// The algorithm compares raw GBK bytes, so the tables must be GBK no
		// matter which encoding this source file happens to be saved in.
		$tables = array('especial_char', 'new_word_limit', 'common_unit', 'cn_sg_num', 'cn_two_name', 'cn_one_name');
		foreach ($tables as $prop) {
			$this->$prop = $this->to_gbk($this->$prop);
		}
		if (!$loaddic) {
			return;
		}

		// One surname per two bytes.
		foreach (str_split($this->cn_one_name, 2) as $surname) {
			$this->one_name_dic[$surname] = 1;
		}
		foreach (explode(' ', $this->cn_two_name) as $surname) {
			$this->two_name_dic[$surname] = 1;
		}
		unset($this->cn_two_name, $this->cn_one_name);

		$dicfile = PC_PATH.'libs'.DIRECTORY_SEPARATOR.'data'.DIRECTORY_SEPARATOR.'dict'.DIRECTORY_SEPARATOR.'dict.csv';
		$fp = is_readable($dicfile) ? fopen($dicfile, 'r') : false;
		if ($fp === false) {
			trigger_error('segment: cannot open dictionary file '.$dicfile, E_USER_WARNING);
			return;
		}
		// Each line is "<word> <rank>"; compare against false so that a line
		// consisting of "0" does not end the loop early.
		while (($line = fgets($fp, 64)) !== false) {
			$ws = explode(' ', $line);
			if (!isset($ws[1]) || $ws[0] === '') {
				continue; // malformed or empty line
			}
			// trim() drops the line terminator that fgets() keeps.
			$this->rank_dic[strlen($ws[0])][$ws[0]] = trim($ws[1]);
		}
		fclose($fp);
	}

	/**
	 * Drops the loaded dictionary. The property stays an array so later
	 * lookups simply find nothing.
	 */
	public function clear() {
		$this->rank_dic = array();
	}

	/**
	 * Stores the text to segment (converted to GBK when CHARSET is 'utf-8')
	 * and resets the accumulated result.
	 *
	 * @param string $str Input text in the site charset.
	 */
	public function get_source($str) {
		if (CHARSET == 'utf-8') {
			$converted = iconv('utf-8', 'gbk', $str);
			// iconv() yields false on unconvertible input; keep the state a string.
			$str = ($converted === false) ? '' : $converted;
		}
		$this->source_string = $str;
		$this->result_string = '';
	}

	/**
	 * Coarse split only: separates whitespace, ASCII runs, symbols and
	 * double-byte runs without consulting the dictionary.
	 *
	 * @param string $str Text (treated as GBK bytes, no charset conversion).
	 * @return string
	 */
	public function simple_split($str) {
		$this->source_string = $this->revise_string($str);
		return $this->source_string;
	}

	/**
	 * Segments $str and returns the words joined by $split_char (with a
	 * trailing separator), in the site charset.
	 *
	 * @param string $str          Text to segment.
	 * @param bool   $try_num_name Forwarded to split_mm().
	 * @param bool   $try_diff     Forwarded to split_mm().
	 * @return string '' when $str is empty after trimming.
	 */
	public function split_result($str = '', $try_num_name = true, $try_diff = true) {
		$str = trim($str);
		if ($str === '') {
			return '';
		}
		$this->get_source($str);
		$this->source_string = preg_replace('/ {1,}/', ' ', $this->revise_string($this->source_string));
		$spwords = explode(' ', $this->source_string);
		$spc = $this->split_char;

		// Walk backwards and prepend, so the result ends up in reading order.
		for ($i = count($spwords) - 1; $i >= 0; $i--) {
			$word = $spwords[$i];
			if ($word === '' || ord($word[0]) < 33) {
				continue;
			}
			if (!isset($word[$this->min_len]) || ord($word[0]) < 0x81) {
				// Too short to split, or an ASCII token: keep verbatim.
				$this->result_string = $word.$spc.$this->result_string;
			} else {
				$this->result_string = $this->split_mm($word, $try_num_name, $try_diff).$spc.$this->result_string;
			}
		}

		if (CHARSET == 'utf-8') {
			return iconv('gbk', 'utf-8', $this->result_string);
		}
		return $this->result_string;
	}

	/**
	 * Re-joins the non-empty space separated tokens of $str, each one
	 * prefixed with $split_char.
	 *
	 * @param string $str
	 * @return string
	 */
	public function par_number($str) {
		if ($str == '') {
			return '';
		}
		$spc = $this->split_char;
		$reStr = '';
		foreach (explode(' ', $str) as $token) {
			if ($token == '') {
				continue;
			}
			$reStr .= $spc.$token;
		}
		return $reStr;
	}

	/**
	 * Joins the words produced by split_mm() back into a string while gluing
	 * numerals to a following unit/numeral and surnames to the following one
	 * or two single characters.
	 *
	 * @param array $word_array Words in reverse reading order (last word first).
	 * @return string Words in reading order separated by $split_char.
	 */
	public function par_other($word_array) {
		$spc = $this->split_char;
		$numeralRe = '/'.$this->cn_sg_num.'/';
		// Group the alternation so that "^" anchors every unit, not just the first.
		$unitRe = '/^(?:'.$this->common_unit.')/';
		$stopRe = '/'.$this->especial_char.'/';
		$rsStr = '';

		for ($i = count($word_array) - 1; $i >= 0; $i--) {
			$word = $word_array[$i];

			if (preg_match($numeralRe, $word)) {
				$rsStr .= $spc.$word;
				if ($i > 0 && preg_match($unitRe, $word_array[$i - 1])) {
					$rsStr .= $word_array[$i - 1];
					$i--;
				} else {
					while ($i > 0 && preg_match($numeralRe, $word_array[$i - 1])) {
						$rsStr .= $word_array[$i - 1];
						$i--;
					}
				}
				continue;
			}

			$rsStr .= $spc.$word;
			$len = strlen($word);

			if ($len == 4 && isset($this->two_name_dic[$word])) {
				// Two-character surname: absorb up to two single characters.
				if ($i > 0 && strlen($word_array[$i - 1]) == 2) {
					$rsStr .= $word_array[$i - 1];
					$i--;
					if ($i > 0 && strlen($word_array[$i - 1]) == 2) {
						$rsStr .= $word_array[$i - 1];
						$i--;
					}
				}
			} else if ($len == 2 && isset($this->one_name_dic[$word])) {
				// One-character surname: same, unless a stop character follows.
				if ($i > 0 && strlen($word_array[$i - 1]) == 2) {
					if (preg_match($stopRe, $word_array[$i - 1])) {
						continue;
					}
					$rsStr .= $word_array[$i - 1];
					$i--;
					if ($i > 0 && strlen($word_array[$i - 1]) == 2 && !preg_match($stopRe, $word_array[$i - 1])) {
						$rsStr .= $word_array[$i - 1];
						$i--;
					}
				}
			}
		}

		return preg_replace('/^'.preg_quote($spc, '/').'/', '', $rsStr);
	}

	/**
	 * Backward maximum matching: scans $str from its end, taking the longest
	 * dictionary word that ends at the current position, or a single
	 * two-byte character when nothing matches.
	 *
	 * @param string $str          A run of double-byte characters (GBK bytes).
	 * @param bool   $try_num_name Post-process with par_other().
	 * @param bool   $try_diff     Post-process with test_diff().
	 * @return string
	 */
	public function split_mm($str, $try_num_name = true, $try_diff = true) {
		$spc = $this->split_char;
		$word_array = array();
		$i = strlen($str) - 1;

		while ($i >= 0) {
			if ($i <= $this->min_len) {
				// At most two characters remain at the front of the string.
				if ($i == 1) {
					$word_array[] = substr($str, 0, 2);
				} else {
					$w = substr($str, 0, $this->min_len + 1);
					if ($this->is_word($w)) {
						$word_array[] = $w;
					} else {
						$word_array[] = substr($str, 2, 2);
						$word_array[] = substr($str, 0, 2);
					}
				}
				break;
			}

			$max_pos = ($i >= $this->max_len) ? $this->max_len : $i;
			$isMatch = false;
			for ($j = $max_pos; $j >= 0; $j -= 2) {
				$w = substr($str, $i - $j, $j + 1);
				if ($this->is_word($w)) {
					$word_array[] = $w;
					$i = $i - $j - 1;
					$isMatch = true;
					break;
				}
			}
			if ($isMatch) {
				continue;
			}
			if ($i > 1) {
				$word_array[] = $str[$i - 1].$str[$i];
				$i -= 2;
			} else {
				// Only reachable if $min_len was lowered; emit the remainder
				// instead of looping forever.
				$word_array[] = substr($str, 0, $i + 1);
				break;
			}
		}

		$rsStr = '';
		if ($try_num_name) {
			$rsStr = $this->par_other($word_array);
		} else {
			foreach (array_reverse($word_array) as $w) {
				$rsStr .= $spc.$w;
			}
		}
		if ($try_diff) {
			$rsStr = $this->test_diff(trim($rsStr));
		}
		return $rsStr;
	}

	/**
	 * Unfinished: normalises $source_string and builds an alternation pattern
	 * from $keyword, but the pattern is discarded and nothing is returned.
	 *
	 * @param string $str     Not used.
	 * @param string $keyword Space separated keywords.
	 * @param int    $strlen  Not used.
	 * @return void
	 */
	public function auto_description($str, $keyword, $strlen) {
		$this->source_string = $this->revise_string($this->source_string);
		$regstr = '';
		// Read the $keyword argument (the class has no "keywords" property).
		foreach (explode(' ', $keyword) as $v) {
			if ($v == '') {
				continue;
			}
			if (ord($v[0]) > 0x80 && strlen($v) < 3) {
				continue; // skip single double-byte characters
			}
			$regstr .= ($regstr == '') ? "($v)" : "|($v)";
		}
	}

	/**
	 * Second pass over a segmented string: merges repeated neighbours and
	 * moves a word boundary when the dictionary rates the alternative split
	 * higher than the current one.
	 *
	 * @param string $str Space separated words (GBK bytes).
	 * @return string Words each prefixed with $split_char; '' for empty input.
	 */
	public function test_diff($str) {
		$str = preg_replace('/ {1,}/', ' ', $str);
		if ($str == '' || $str == ' ') {
			return '';
		}
		$ws = explode(' ', $str);
		$wlen = count($ws);
		$spc = $this->split_char;
		$reStr = '';

		for ($i = 0; $i < $wlen; $i++) {
			$cur = $ws[$i];
			if ($i >= $wlen - 1) {
				$reStr .= $spc.$cur;
				continue;
			}
			$next = $ws[$i + 1];
			$curLen = strlen($cur);
			$nextLen = strlen($next);

			// Strict comparison: "1" and "01" are different words.
			if ($cur === $next) {
				$reStr .= $spc.$cur.$next;
				$i++;
				continue;
			}

			if ($curLen == 2 && $nextLen < 8 && $nextLen > 2) {
				// Single character followed by a 2-3 character word.
				$addw = $cur.$next;
				$testok = false;
				for ($t = 6; $t >= 4; $t -= 2) {
					$w = substr($addw, 0, $t);
					if ($this->is_word($w) && ($this->get_rank($w) > $this->get_rank($next) * 2)) {
						$limit_word = substr($next, $nextLen - $t - 2, $nextLen - strlen($w) + 2);
						$reStr .= ($limit_word != '') ? $spc.$w.$spc.$limit_word : $spc.$w;
						$testok = true;
						break;
					}
				}
				if ($testok) {
					$i++;
				} else {
					$reStr .= $spc.$cur;
				}
			} else if ($curLen > 2 && $curLen < 8 && $nextLen > 2 && $nextLen < 8) {
				$t21 = substr($next, 0, 2);
				$t22 = substr($next, 0, 4);
				if ($this->is_word($cur.$t21)) {
					if ($curLen == 6 || $nextLen == 6) {
						$reStr .= $spc.$cur.$t21.$spc.substr($next, 2, $nextLen - 2);
						$i++;
					} else {
						$reStr .= $spc.$cur;
					}
				} else if ($nextLen == 6) {
					if ($this->is_word($cur.$t22)) {
						$reStr .= $spc.$cur.$t22.$spc.$next[4].$next[5];
						$i++;
					} else {
						$reStr .= $spc.$cur;
					}
				} else if ($nextLen == 4) {
					$addw = $cur.$next;
					$testok = false;
					for ($t = $nextLen - 2; $t > 0; $t -= 2) {
						$w = substr($addw, 0, $curLen + $t);
						if ($this->is_word($w) && ($this->get_rank($w) > $this->get_rank($next) * 2)) {
							$limit_word = substr($next, $t, $nextLen - $t);
							$reStr .= ($limit_word != '') ? $spc.$w.$spc.$limit_word : $spc.$w;
							$testok = true;
							break;
						}
					}
					if ($testok) {
						$i++;
					} else {
						$reStr .= $spc.$cur;
					}
				} else {
					$reStr .= $spc.$cur;
				}
			} else {
				$reStr .= $spc.$cur;
			}
		}
		return $reStr;
	}

	/**
	 * Tells whether $okWord is present in the loaded dictionary.
	 *
	 * @param string $okWord Candidate word (GBK bytes).
	 * @return bool
	 */
	public function is_word($okWord) {
		$slen = strlen($okWord);
		if ($slen > $this->max_len) {
			return false;
		}
		return isset($this->rank_dic[$slen][$okWord]);
	}

	/**
	 * Coarse tokenizer: inserts $split_char between whitespace, ASCII runs,
	 * ASCII symbols, double-byte symbols and double-byte text, and converts
	 * full-width digits/letters to their ASCII form.
	 *
	 * @param string $str Text as GBK bytes.
	 * @return string
	 */
	public function revise_string($str) {
		$spc = $this->split_char;
		$slen = strlen($str);
		if ($slen == 0) {
			return '';
		}
		// Book-title marks as GBK bytes, independent of this file's encoding.
		$open = $this->to_gbk('《');
		$close = $this->to_gbk('》');

		$okstr = '';
		$prechar = 0; // kind of the previous character: 0 blank, 1 ASCII, 2 Chinese, 3 symbol
		for ($i = 0; $i < $slen; $i++) {
			$ch = $str[$i];
			$code = ord($ch);

			if ($code < 0x81) {
				if ($code < 33) {
					// Whitespace / control character: collapse into one separator.
					if ($prechar != 0) {
						$okstr .= $spc;
					}
					$prechar = 0;
				} else if (preg_match('/[^0-9a-zA-Z@\.%#:\/\&_-]/', $ch)) {
					// ASCII symbol: always a token of its own.
					$okstr .= ($prechar == 0) ? $ch : $spc.$ch;
					$prechar = 3;
				} else {
					// Letters, digits and the characters allowed inside URLs/e-mail
					// addresses extend the current ASCII run. (The original tested a
					// single character against /@#%:/ here, which can never match.)
					$okstr .= ($prechar == 2 || $prechar == 3) ? $spc.$ch : $ch;
					$prechar = 1;
				}
				continue;
			}

			// Lead byte of a double-byte character.
			if ($prechar != 0 && $prechar != 2) {
				$okstr .= $spc;
			}
			if (!isset($str[$i + 1])) {
				continue; // dangling lead byte at the end of input is dropped
			}
			$c = $ch.$str[$i + 1];
			$i++;

			$half = $this->fullwidth_to_ascii($c);
			if ($half !== '') {
				$okstr .= $half;
				$prechar = 2;
				continue;
			}

			$n = hexdec(bin2hex($c));
			if ($n > 0xA13F && $n < 0xAA40) {
				// Double-byte symbol area.
				if ($c == $open) {
					$okstr .= ($prechar != 0) ? $spc.' '.$open : ' '.$open;
					$prechar = 2;
				} else if ($c == $close) {
					$okstr .= $close.' ';
					$prechar = 3;
				} else {
					$okstr .= ($prechar != 0) ? $spc.$c : $c;
					$prechar = 3;
				}
			} else {
				$okstr .= $c;
				$prechar = 2;
			}
		}
		return $okstr;
	}

	/**
	 * Placeholder: returns $str unchanged.
	 *
	 * @param string $str
	 * @param int    $maxlen Not used.
	 * @return string
	 */
	public function find_new_word($str, $maxlen = 6) {
		return $str;
	}

	/**
	 * Segments $str and returns its most frequent words separated by spaces,
	 * in the site charset.
	 *
	 * @param string $str  Text to analyse.
	 * @param int    $ilen -1: return every word whose dictionary rank is above
	 *                     500; otherwise: return words, most frequent first,
	 *                     while the result stays shorter than $ilen bytes.
	 * @return string
	 */
	public function get_keyword($str, $ilen = -1) {
		// Also bail out on whitespace-only input: split_result() would return
		// early without resetting $result_string, leaving stale data behind.
		if ($str == '' || trim($str) === '') {
			return '';
		}
		$this->split_result($str, true, true);

		// Must be an array: indexing a string by word is a string-offset write.
		$wks = array();
		foreach (explode(' ', $this->result_string) as $w) {
			$w = trim($w);
			if (strlen($w) < 2) {
				continue;
			}
			if (!preg_match('/[^0-9:-]/', $w)) {
				continue; // only digits, ":" and "-"
			}
			if (strlen($w) == 2 && ord($w[0]) > 0x80) {
				continue; // single double-byte character
			}
			if (isset($wks[$w])) {
				$wks[$w]++;
			} else {
				$wks[$w] = 1;
			}
		}
		arsort($wks);

		$okstr = '';
		foreach ($wks as $w => $v) {
			if ($ilen == -1) {
				if ($this->get_rank($w) > 500) {
					$okstr .= $w.' ';
				}
			} else if ((strlen($okstr) + strlen($w) + 1) < $ilen) {
				$okstr .= $w.' ';
			} else {
				break;
			}
		}

		if (CHARSET == 'utf-8') {
			$okstr = iconv('gbk', 'utf-8', $okstr);
		}
		return trim($okstr);
	}

	/**
	 * Returns the dictionary rank of $w, or 0 when the word is unknown.
	 *
	 * @param string $w Word (GBK bytes).
	 * @return int|string
	 */
	public function get_rank($w) {
		$len = strlen($w);
		if (isset($this->rank_dic[$len][$w])) {
			return $this->rank_dic[$len][$w];
		}
		return 0;
	}

	/**
	 * Converts every full-width digit, Latin letter and "+ - % ." found in a
	 * GBK byte string to its ASCII form; everything else is copied unchanged.
	 *
	 * The original passed a string as the replacement for an array search to
	 * str_replace(), which substitutes the whole replacement string for
	 * every hit instead of the matching character.
	 *
	 * @param string $fnum GBK bytes.
	 * @return string
	 */
	public function get_alab_num($fnum) {
		$out = '';
		$len = strlen($fnum);
		for ($i = 0; $i < $len; $i++) {
			$byte = $fnum[$i];
			if (ord($byte) < 0x81 || !isset($fnum[$i + 1])) {
				$out .= $byte;
				continue;
			}
			// Consume a whole double-byte character so that a trail byte is
			// never mistaken for the start of another character.
			$pair = $byte.$fnum[$i + 1];
			$half = $this->fullwidth_to_ascii($pair);
			$out .= ($half === '') ? $pair : $half;
			$i++;
		}
		return $out;
	}

	/**
	 * Maps one GBK full-width character to ASCII.
	 *
	 * Full-width ASCII forms are encoded as lead byte 0xA3 followed by the
	 * ASCII code plus 0x80.
	 *
	 * @param string $pair Exactly two bytes.
	 * @return string The ASCII character, or '' when $pair is not a
	 *                full-width digit, Latin letter or one of "+ - % .".
	 */
	protected function fullwidth_to_ascii($pair) {
		if (strlen($pair) !== 2 || $pair[0] !== "\xA3") {
			return '';
		}
		$code = ord($pair[1]) - 0x80;
		if ($code < 0x21 || $code > 0x7E) {
			return '';
		}
		$ascii = chr($code);
		return preg_match('/^[0-9A-Za-z+%.-]$/', $ascii) ? $ascii : '';
	}

	/**
	 * Returns $text as GBK bytes. Text that contains non-ASCII bytes and is
	 * well-formed UTF-8 is converted; anything else is taken to be GBK
	 * already and returned unchanged (heuristic: GBK-encoded Chinese text is
	 * practically never well-formed UTF-8).
	 *
	 * @param string $text
	 * @return string
	 */
	protected function to_gbk($text) {
		if (!is_string($text) || $text === '' || !function_exists('iconv')) {
			return $text;
		}
		if (!preg_match('/[\x80-\xff]/', $text)) {
			return $text; // pure ASCII
		}
		if (preg_match('//u', $text) !== 1) {
			return $text; // not valid UTF-8
		}
		$converted = iconv('utf-8', 'gbk', $text);
		return ($converted === false) ? $text : $converted;
	}
}