segment.class.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379
  1. <?php
  2. /**
  3. * 中文分词操作类
  4. * @author wangcanjia
  5. *
  6. */
  7. class segment {
  8. public $rank_dic = array();
  9. public $one_name_dic = array();
  10. public $two_name_dic = array();
  11. public $new_word = array();
  12. public $source_string = '';
  13. public $result_string = '';
  14. public $split_char = ' '; //分隔符
  15. public $SplitLen = 4; //保留词长度
  16. public $especial_char = "和|的|是";
  17. public $new_word_limit = "在|的|与|或|就|你|我|他|她|有|了|是|其|能|对|地";
  18. public $common_unit = "年|月|日|时|分|秒|点|元|百|千|万|亿|位|辆";
  19. public $cn_number = "0|1|2|3|4|5|6|7|8|9|+|-|%|.|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s |t|u|v|w|x|y|z|A|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z";
  20. public $cn_sg_num = "一|二|三|四|五|六|七|八|九|十|百|千|万|亿|数";
  21. public $max_len = 13; //词典最大 7 中文字,这里的数值为字节数组的最大索引
  22. public $min_len = 3; //最小 2 中文字,这里的数值为字节数组的最大索引
  23. public $cn_two_name = "端木 南宫 谯笪 轩辕 令狐 钟离 闾丘 长孙 鲜于 宇文 司徒 司空 上官 欧阳 公孙 西门 东门 左丘 东郭 呼延 慕容 司马 夏侯 诸葛 东方 赫连 皇甫 尉迟 申屠";
  24. public $cn_one_name = "赵钱孙李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许何吕施张孔曹严华金魏陶姜戚谢邹喻柏水窦章云苏潘葛奚范彭郎鲁韦昌马苗凤花方俞任袁柳酆鲍史唐费廉岑薛雷贺倪汤滕殷罗毕郝邬安常乐于时傅皮卡齐康伍余元卜顾孟平黄穆萧尹姚邵堪汪祁毛禹狄米贝明臧计伏成戴谈宋茅庞熊纪舒屈项祝董粱杜阮蓝闵席季麻强贾路娄危江童颜郭梅盛林刁钟徐邱骆高夏蔡田樊胡凌霍虞万支柯咎管卢莫经房裘缪干解应宗宣丁贲邓郁单杭洪包诸左石崔吉钮龚程嵇邢滑裴陆荣翁荀羊於惠甄魏加封芮羿储靳汲邴糜松井段富巫乌焦巴弓牧隗谷车侯宓蓬全郗班仰秋仲伊宫宁仇栾暴甘钭厉戎祖武符刘姜詹束龙叶幸司韶郜黎蓟薄印宿白怀蒲台从鄂索咸籍赖卓蔺屠蒙池乔阴郁胥能苍双闻莘党翟谭贡劳逄姬申扶堵冉宰郦雍郤璩桑桂濮牛寿通边扈燕冀郏浦尚农温别庄晏柴翟阎充慕连茹习宦艾鱼容向古易慎戈廖庚终暨居衡步都耿满弘匡国文寇广禄阙东殴殳沃利蔚越夔隆师巩厍聂晁勾敖融冷訾辛阚那简饶空曾沙须丰巢关蒯相查后江游竺";
  25. function __construct($loaddic=true) {
  26. if($loaddic) {
  27. for($i=0;$i<strlen($this->cn_one_name);$i++){
  28. $this->one_name_dic[$this->cn_one_name[$i].$this->cn_one_name[$i+1]] = 1;
  29. $i++;
  30. }
  31. $twoname = explode(" ",$this->cn_two_name);
  32. foreach($twoname as $n){ $this->two_name_dic[$n] = 1; }
  33. unset($twoname);
  34. unset($this->cn_two_name);
  35. unset($this->cn_one_name);
  36. $dicfile = PC_PATH.'libs'.DIRECTORY_SEPARATOR.'data'.DIRECTORY_SEPARATOR.'dict'.DIRECTORY_SEPARATOR.'dict.csv';
  37. $fp = fopen($dicfile,'r');
  38. while($line = fgets($fp,64)){
  39. $ws = explode(' ',$line);
  40. $this->rank_dic[strlen($ws[0])][$ws[0]] = $ws[1];
  41. }
  42. fclose($fp);
  43. }
  44. }
  45. function clear() {
  46. unset($this->rank_dic);
  47. }
  48. function get_source($str) {
  49. if(CHARSET == 'utf-8') $str = iconv('utf-8','gbk',$str);
  50. $this->source_string = $str;
  51. $this->result_string = '';
  52. }
  53. function simple_split($str) {
  54. $this->source_string = $this->revise_string($str);
  55. return $this->source_string;
  56. }
  57. function split_result($str='',$try_num_name=true,$try_diff=true) {
  58. $str = trim($str);
  59. if($str!='') $this->get_source($str);
  60. else return '';
  61. $this->source_string = preg_replace('/ {1,}/',' ',$this->revise_string($this->source_string));
  62. $spwords = explode(' ',$this->source_string);
  63. $spLen = count($spwords) - 1;
  64. $spc = $this->split_char;
  65. for($i=$spLen;$i>=0;$i--){
  66. if(ord($spwords[$i][0])<33) continue;
  67. else if(!isset($spwords[$i][$this->min_len])) $this->result_string = $spwords[$i].$spc.$this->result_string;
  68. else if(ord($spwords[$i][0])<0x81){
  69. $this->result_string = $spwords[$i].$spc.$this->result_string;
  70. } else {
  71. $this->result_string = $this->split_mm($spwords[$i],$try_num_name,$try_diff).$spc.$this->result_string;
  72. }
  73. }
  74. if(CHARSET=='utf-8') $okstr = iconv('gbk','utf-8',$this->result_string);
  75. else $okstr = $this->result_string;
  76. return $okstr;
  77. }
  78. function par_number($str) {
  79. if($str == '') return '';
  80. $ws = explode(' ',$str);
  81. $wlen = count($ws);
  82. $spc = $this->split_char;
  83. $reStr = '';
  84. for($i=0;$i<$wlen;$i++){
  85. if($ws[$i]=='') continue;
  86. if($i>=$wlen-1) $reStr .= $spc.$ws[$i];
  87. else{ $reStr .= $spc.$ws[$i]; }
  88. }
  89. return $reStr;
  90. }
  91. function par_other($word_array) {
  92. $wlen = count($word_array)-1;
  93. $rsStr = '';
  94. $spc = $this->split_char;
  95. for($i=$wlen;$i>=0;$i--) {
  96. if(preg_match('/'.$this->cn_sg_num.'/',$word_array[$i])) {
  97. $rsStr .= $spc.$word_array[$i];
  98. if($i>0 && preg_match('/^'.$this->common_unit.'/',$word_array[$i-1]) ) {
  99. $rsStr .= $word_array[$i-1]; $i--;
  100. } else {
  101. while($i>0 && preg_match("/".$this->cn_sg_num."/",$word_array[$i-1]) ){ $rsStr .= $word_array[$i-1]; $i--; }
  102. }
  103. continue;
  104. }
  105. if(strlen($word_array[$i])==4 && isset($this->two_name_dic[$word_array[$i]])) {
  106. $rsStr .= $spc.$word_array[$i];
  107. if($i>0&&strlen($word_array[$i-1])==2){
  108. $rsStr .= $word_array[$i-1];$i--;
  109. if($i>0&&strlen($word_array[$i-1])==2){ $rsStr .= $word_array[$i-1];$i--; }
  110. }
  111. } else if(strlen($word_array[$i])==2 && isset($this->one_name_dic[$word_array[$i]])) {
  112. $rsStr .= $spc.$word_array[$i];
  113. if($i>0&&strlen($word_array[$i-1])==2){
  114. if(preg_match("/".$this->especial_char."/",$word_array[$i-1])) continue;
  115. $rsStr .= $word_array[$i-1];$i--;
  116. if($i>0 && strlen($word_array[$i-1])==2 &&
  117. !preg_match("/".$this->especial_char."/",$word_array[$i-1]))
  118. { $rsStr .= $word_array[$i-1];$i--; }
  119. }
  120. } else {
  121. $rsStr .= $spc.$word_array[$i];
  122. }
  123. }
  124. $rsStr = preg_replace("/^".$spc."/","",$rsStr);
  125. return $rsStr;
  126. }
  127. function split_mm($str,$try_num_name=true,$try_diff=true) {
  128. $spc = $this->split_char;
  129. $spLen = strlen($str);
  130. $rsStr = $okWord = $tmpWord = '';
  131. $word_array = array();
  132. for($i=($spLen-1);$i>=0;) {
  133. if($i<=$this->min_len){
  134. if($i==1){
  135. $word_array[] = substr($str,0,2);
  136. } else {
  137. $w = substr($str,0,$this->min_len+1);
  138. if($this->is_word($w)){
  139. $word_array[] = $w;
  140. }else{
  141. $word_array[] = substr($str,2,2);
  142. $word_array[] = substr($str,0,2);
  143. }
  144. }
  145. $i = -1; break;
  146. }
  147. if($i>=$this->max_len) $max_pos = $this->max_len;
  148. else $max_pos = $i;
  149. $isMatch = false;
  150. for($j=$max_pos;$j>=0;$j=$j-2){
  151. $w = substr($str,$i-$j,$j+1);
  152. if($this->is_word($w)){
  153. $word_array[] = $w;
  154. $i = $i-$j-1;
  155. $isMatch = true;
  156. break;
  157. }
  158. }
  159. if(!$isMatch){
  160. if($i>1) {
  161. $word_array[] = $str[$i-1].$str[$i];
  162. $i = $i-2;
  163. }
  164. }
  165. }//End For
  166. if($try_num_name) {
  167. $rsStr = $this->par_other($word_array);
  168. } else {
  169. $wlen = count($word_array)-1;
  170. for($i=$wlen;$i>=0;$i--){
  171. $rsStr .= $spc.$word_array[$i];
  172. }
  173. }
  174. if($try_diff) $rsStr = $this->test_diff(trim($rsStr));
  175. return $rsStr;
  176. }
  177. function auto_description($str,$keyword,$strlen) {
  178. $this->source_string = $this->revise_string($this->source_string);
  179. $spwords = explode(" ",$this->source_string);
  180. $keywords = explode(" ",$this->keywords);
  181. $regstr = "";
  182. foreach($keywords as $k=>$v) {
  183. if($v=="") continue;
  184. if(ord($v[0])>0x80 && strlen($v)<3) continue;
  185. if($regstr=="") $regstr .= "($v)";
  186. else $regstr .= "|($v)";
  187. }
  188. }
  189. function test_diff($str) {
  190. $str = preg_replace("/ {1,}/"," ",$str);
  191. if($str == ""||$str == " ") return "";
  192. $ws = explode(' ',$str);
  193. $wlen = count($ws);
  194. $spc = $this->split_char;
  195. $reStr = "";
  196. for($i=0;$i<$wlen;$i++) {
  197. if($i>=($wlen-1)) {
  198. $reStr .= $spc.$ws[$i];
  199. } else {
  200. if($ws[$i]==$ws[$i+1]){
  201. $reStr .= $spc.$ws[$i].$ws[$i+1];
  202. $i++; continue;
  203. }
  204. if(strlen($ws[$i])==2 && strlen($ws[$i+1])<8 && strlen($ws[$i+1])>2) {
  205. $addw = $ws[$i].$ws[$i+1];
  206. $t = 6;
  207. $testok = false;
  208. while($t>=4) {
  209. $w = substr($addw,0,$t);
  210. if($this->is_word($w) && ($this->get_rank($w) > $this->get_rank($ws[$i+1])*2) ) {
  211. $limit_word = substr($ws[$i+1],strlen($ws[$i+1])-$t-2,strlen($ws[$i+1])-strlen($w)+2);
  212. if($limit_word!="") $reStr .= $spc.$w.$spc.$limit_word;
  213. else $reStr .= $spc.$w;
  214. $testok = true;
  215. break;
  216. }
  217. $t = $t-2;
  218. }
  219. if(!$testok) $reStr .= $spc.$ws[$i];
  220. else $i++;
  221. } else if(strlen($ws[$i])>2 && strlen($ws[$i])<8 && strlen($ws[$i+1])>2 && strlen($ws[$i+1])<8) {
  222. $t21 = substr($ws[$i+1],0,2);
  223. $t22 = substr($ws[$i+1],0,4);
  224. if($this->is_word($ws[$i].$t21)) {
  225. if(strlen($ws[$i])==6||strlen($ws[$i+1])==6){
  226. $reStr .= $spc.$ws[$i].$t21.$spc.substr($ws[$i+1],2,strlen($ws[$i+1])-2);
  227. $i++;
  228. } else {
  229. $reStr .= $spc.$ws[$i];
  230. }
  231. } else if(strlen($ws[$i+1])==6) {
  232. if($this->is_word($ws[$i].$t22)) {
  233. $reStr .= $spc.$ws[$i].$t22.$spc.$ws[$i+1][4].$ws[$i+1][5];
  234. $i++;
  235. } else { $reStr .= $spc.$ws[$i]; }
  236. } else if(strlen($ws[$i+1])==4) {
  237. $addw = $ws[$i].$ws[$i+1];
  238. $t = strlen($ws[$i+1])-2;
  239. $testok = false;
  240. while($t>0) {
  241. $w = substr($addw,0,strlen($ws[$i])+$t);
  242. if($this->is_word($w) && ($this->get_rank($w) > $this->get_rank($ws[$i+1])*2) ) {
  243. $limit_word = substr($ws[$i+1],$t,strlen($ws[$i+1])-$t);
  244. if($limit_word!="") $reStr .= $spc.$w.$spc.$limit_word;
  245. else $reStr .= $spc.$w;
  246. $testok = true;
  247. break;
  248. }
  249. $t = $t-2;
  250. }
  251. if(!$testok) $reStr .= $spc.$ws[$i];
  252. else $i++;
  253. }else {
  254. $reStr .= $spc.$ws[$i];
  255. }
  256. } else {
  257. $reStr .= $spc.$ws[$i];
  258. }
  259. }
  260. }//End For
  261. return $reStr;
  262. }
  263. function is_word($okWord){
  264. $slen = strlen($okWord);
  265. if($slen > $this->max_len) return false;
  266. else return isset($this->rank_dic[$slen][$okWord]);
  267. }
  268. function revise_string($str) {
  269. $spc = $this->split_char;
  270. $slen = strlen($str);
  271. if($slen==0) return '';
  272. $okstr = '';
  273. $prechar = 0; // 0-空白 1-英文 2-中文 3-符号
  274. for($i=0;$i<$slen;$i++){
  275. if(ord($str[$i]) < 0x81) {
  276. if(ord($str[$i]) < 33){
  277. //$str[$i]!="\r"&&$str[$i]!="\n"
  278. if($prechar!=0) $okstr .= $spc;
  279. $prechar=0;
  280. continue;
  281. } else if(preg_match("/[^0-9a-zA-Z@\.%#:\\/\\&_-]/",$str[$i])) {
  282. if($prechar==0) {
  283. $okstr .= $str[$i]; $prechar=3;
  284. } else {
  285. $okstr .= $spc.$str[$i]; $prechar=3;
  286. }
  287. } else {
  288. if($prechar==2||$prechar==3) {
  289. $okstr .= $spc.$str[$i]; $prechar=1;
  290. } else {
  291. if(preg_match("/@#%:/",$str[$i])){ $okstr .= $str[$i]; $prechar=3; }
  292. else { $okstr .= $str[$i]; $prechar=1; }
  293. }
  294. }
  295. } else{
  296. if($prechar!=0 && $prechar!=2) $okstr .= $spc;
  297. if(isset($str[$i+1])){
  298. $c = $str[$i].$str[$i+1];
  299. if(preg_match("/".$this->cn_number."/",$c)) {
  300. $okstr .= $this->get_alab_num($c); $prechar = 2; $i++; continue;
  301. }
  302. $n = hexdec(bin2hex($c));
  303. if($n>0xA13F && $n < 0xAA40) {
  304. if($c=="《"){
  305. if($prechar!=0) $okstr .= $spc." 《";
  306. else $okstr .= " 《";
  307. $prechar = 2;
  308. } else if($c=="》"){
  309. $okstr .= "》 ";
  310. $prechar = 3;
  311. } else{
  312. if($prechar!=0) $okstr .= $spc.$c;
  313. else $okstr .= $c;
  314. $prechar = 3;
  315. }
  316. } else {
  317. $okstr .= $c;
  318. $prechar = 2;
  319. }
  320. $i++;
  321. }
  322. }//中文字符
  323. }//结束循环
  324. return $okstr;
  325. }
  326. function find_new_word($str,$maxlen=6) {
  327. $okstr = "";
  328. return $str;
  329. }
  330. function get_keyword($str,$ilen=-1) {
  331. if($str=='') return '';
  332. else $this->split_result($str,true,true);
  333. $okstr = $this->result_string;
  334. $ws = explode(' ',$okstr);
  335. $okstr = $wks = '';
  336. foreach($ws as $w) {
  337. $w = trim($w);
  338. if(strlen($w)<2) continue;
  339. if(!preg_match("/[^0-9:-]/",$w)) continue;
  340. if(strlen($w)==2&&ord($w[0])>0x80) continue;
  341. if(isset($wks[$w])) $wks[$w]++;
  342. else $wks[$w] = 1;
  343. }
  344. if(is_array($wks)) {
  345. arsort($wks);
  346. if($ilen==-1) {
  347. foreach($wks as $w=>$v) {
  348. if($this->get_rank($w)>500) $okstr .= $w." ";
  349. }
  350. } else {
  351. foreach($wks as $w=>$v){
  352. if((strlen($okstr)+strlen($w)+1)<$ilen) $okstr .= $w." ";
  353. else break;
  354. }
  355. }
  356. }
  357. if(CHARSET=='utf-8') $okstr = iconv('gbk','utf-8',$okstr);
  358. return trim($okstr);
  359. }
  360. function get_rank($w){
  361. if(isset($this->rank_dic[strlen($w)][$w])) return $this->rank_dic[strlen($w)][$w];
  362. else return 0;
  363. }
  364. function get_alab_num($fnum){
  365. $nums = array("0","1","2","3","4","5","6",
  366. "7","8","9","+","-","%",".",
  367. "a","b","c","d","e","f","g","h","i","j","k","l","m",
  368. "n","o","p","q","r","s ","t","u","v","w","x","y","z",
  369. "A","B","C","D","E","F","G","H","I","J","K","L","M",
  370. "N","O","P","Q","R","S","T","U","V","W","X","Y","Z");
  371. $fnums = "0123456789+-%.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
  372. $fnum = str_replace($nums,$fnums,$fnum);
  373. return $fnum;
  374. }
  375. }
  376. ?>