PHP code example of caasdata / jieba-php
1. Go to this page and download the library: Download caasdata/jieba-php library. Choose the download type "require".
2. Extract the ZIP file and open the index.php.
3. Add this code to the index.php.
<?php
// Load Composer's autoloader relative to this file, so the script works
// regardless of the current working directory or include_path.
require_once __DIR__ . '/vendor/autoload.php';
/* Start to develop here. Best regards https://php-download.com/ */
caasdata / jieba-php example snippets
composer
ini_set('memory_limit', '1024M');

// Load the library classes through Composer's autoloader.
require_once __DIR__ . '/vendor/autoload.php';

use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\Jieba;

// Initialise both classes before calling any cut method.
Jieba::init();
Finalseg::init();

$seg_list = Jieba::cut("怜香惜玉也得要看对象啊!");
var_dump($seg_list);

$seg_list = Jieba::cut("我来到北京清华大学", true);
var_dump($seg_list); # full mode

$seg_list = Jieba::cut("我来到北京清华大学", false);
var_dump($seg_list); # default (accurate) mode

$seg_list = Jieba::cut("他来到了网易杭研大厦");
var_dump($seg_list);

$seg_list = Jieba::cutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"); # search engine mode
var_dump($seg_list);
array(7) {
[0]=>
string(12) "怜香惜玉"
[1]=>
string(3) "也"
[2]=>
string(3) "得"
[3]=>
string(3) "要"
[4]=>
string(3) "看"
[5]=>
string(6) "对象"
[6]=>
string(3) "啊"
}
Full Mode:
array(15) {
[0]=>
string(3) "我"
[1]=>
string(3) "来"
[2]=>
string(6) "来到"
[3]=>
string(3) "到"
[4]=>
string(3) "北"
[5]=>
string(6) "北京"
[6]=>
string(3) "京"
[7]=>
string(3) "清"
[8]=>
string(6) "清华"
[9]=>
string(12) "清华大学"
[10]=>
string(3) "华"
[11]=>
string(6) "华大"
[12]=>
string(3) "大"
[13]=>
string(6) "大学"
[14]=>
string(3) "学"
}
Default Mode:
array(4) {
[0]=>
string(3) "我"
[1]=>
string(6) "来到"
[2]=>
string(6) "北京"
[3]=>
string(12) "清华大学"
}
array(6) {
[0]=>
string(3) "他"
[1]=>
string(6) "来到"
[2]=>
string(3) "了"
[3]=>
string(6) "网易"
[4]=>
string(6) "杭研"
[5]=>
string(6) "大厦"
}
(此處,“杭研”並沒有在詞典中,但是也被 Viterbi 算法識別出來了)
Search Engine Mode:
array(18) {
[0]=>
string(6) "小明"
[1]=>
string(6) "硕士"
[2]=>
string(6) "毕业"
[3]=>
string(3) "于"
[4]=>
string(6) "中国"
[5]=>
string(6) "科学"
[6]=>
string(6) "学院"
[7]=>
string(9) "科学院"
[8]=>
string(15) "中国科学院"
[9]=>
string(6) "计算"
[10]=>
string(9) "计算所"
[11]=>
string(3) "后"
[12]=>
string(3) "在"
[13]=>
string(6) "日本"
[14]=>
string(6) "京都"
[15]=>
string(6) "大学"
[16]=>
string(18) "日本京都大学"
[17]=>
string(6) "深造"
}
ini_set('memory_limit', '600M');

// Load the library classes through Composer's autoloader.
require_once __DIR__ . '/vendor/autoload.php';

use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\Jieba;
use Fukuball\Jieba\JiebaAnalyse;

Jieba::init(['mode' => 'test', 'dict' => 'small']);
Finalseg::init();
JiebaAnalyse::init();

$top_k = 10;

// file_get_contents() has no fopen-style mode argument; its second parameter
// is the boolean use_include_path flag, so only the path is passed here.
$content = file_get_contents("/path/to/your/dict/lyric.txt");
if ($content === false) {
    throw new RuntimeException('Unable to read /path/to/your/dict/lyric.txt');
}

// Extract the top $top_k tags from the text.
$tags = JiebaAnalyse::extractTags($content, $top_k);
var_dump($tags);

// Extract again after registering a stop-word file.
JiebaAnalyse::setStopWords('/path/to/your/dict/stop_words.txt');
$tags = JiebaAnalyse::extractTags($content, $top_k);
var_dump($tags);
array(10) {
'沒有' =>
double(1.0592831964595)
'所謂' =>
double(0.90795702553671)
'是否' =>
double(0.66385043195443)
'一般' =>
double(0.54607060161899)
'雖然' =>
double(0.30265234184557)
'來說' =>
double(0.30265234184557)
'肌迫' =>
double(0.30265234184557)
'退縮' =>
double(0.30265234184557)
'矯作' =>
double(0.30265234184557)
'怯懦' =>
double(0.24364586159392)
}
array(10) {
'所謂' =>
double(1.1569129841516)
'一般' =>
double(0.69579963754677)
'矯作' =>
double(0.38563766138387)
'來說' =>
double(0.38563766138387)
'退縮' =>
double(0.38563766138387)
'雖然' =>
double(0.38563766138387)
'肌迫' =>
double(0.38563766138387)
'怯懦' =>
double(0.31045198493419)
'隨便說說' =>
double(0.19281883069194)
'一場' =>
double(0.19281883069194)
}
ini_set('memory_limit', '600M');

// Load the library classes through Composer's autoloader.
require_once __DIR__ . '/vendor/autoload.php';

use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\Jieba;
use Fukuball\Jieba\Posseg;

Jieba::init();
Finalseg::init();
Posseg::init();

// Each result entry is an array with a "word" and a "tag" key (see the dump below).
$seg_list = Posseg::cut("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。");
var_dump($seg_list);
array(21) {
[0]=>
array(2) {
["word"]=>
string(3) "这"
["tag"]=>
string(1) "r"
}
[1]=>
array(2) {
["word"]=>
string(3) "是"
["tag"]=>
string(1) "v"
}
[2]=>
array(2) {
["word"]=>
string(6) "一个"
["tag"]=>
string(1) "m"
}
[3]=>
array(2) {
["word"]=>
string(18) "伸手不见五指"
["tag"]=>
string(1) "i"
}
[4]=>
array(2) {
["word"]=>
string(3) "的"
["tag"]=>
string(2) "uj"
}
[5]=>
array(2) {
["word"]=>
string(6) "黑夜"
["tag"]=>
string(1) "n"
}
[6]=>
array(2) {
["word"]=>
string(3) "。"
["tag"]=>
string(1) "x"
}
[7]=>
array(2) {
["word"]=>
string(3) "我"
["tag"]=>
string(1) "r"
}
[8]=>
array(2) {
["word"]=>
string(3) "叫"
["tag"]=>
string(1) "v"
}
[9]=>
array(2) {
["word"]=>
string(9) "孙悟空"
["tag"]=>
string(2) "nr"
}
[10]=>
array(2) {
["word"]=>
string(3) ","
["tag"]=>
string(1) "x"
}
[11]=>
array(2) {
["word"]=>
string(3) "我"
["tag"]=>
string(1) "r"
}
[12]=>
array(2) {
["word"]=>
string(3) "爱"
["tag"]=>
string(1) "v"
}
[13]=>
array(2) {
["word"]=>
string(6) "北京"
["tag"]=>
string(2) "ns"
}
[14]=>
array(2) {
["word"]=>
string(3) ","
["tag"]=>
string(1) "x"
}
[15]=>
array(2) {
["word"]=>
string(3) "我"
["tag"]=>
string(1) "r"
}
[16]=>
array(2) {
["word"]=>
string(3) "爱"
["tag"]=>
string(1) "v"
}
[17]=>
array(2) {
["word"]=>
string(6) "Python"
["tag"]=>
string(3) "eng"
}
[18]=>
array(2) {
["word"]=>
string(3) "和"
["tag"]=>
string(1) "c"
}
[19]=>
array(2) {
["word"]=>
string(3) "C++"
["tag"]=>
string(3) "eng"
}
[20]=>
array(2) {
["word"]=>
string(3) "。"
["tag"]=>
string(1) "x"
}
}
ini_set('memory_limit', '1024M');

// Load the library classes through Composer's autoloader.
require_once __DIR__ . '/vendor/autoload.php';

use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\Jieba;

// Initialise with the "big" dictionary option.
Jieba::init(['mode' => 'default', 'dict' => 'big']);
Finalseg::init();

// Simplified Chinese input.
$seg_list = Jieba::cut("怜香惜玉也得要看对象啊!");
var_dump($seg_list);

// Traditional Chinese input.
$seg_list = Jieba::cut("憐香惜玉也得要看對象啊!");
var_dump($seg_list);
array(7) {
[0]=>
string(12) "怜香惜玉"
[1]=>
string(3) "也"
[2]=>
string(3) "得"
[3]=>
string(3) "要"
[4]=>
string(3) "看"
[5]=>
string(6) "对象"
[6]=>
string(3) "啊"
}
array(7) {
[0]=>
string(12) "憐香惜玉"
[1]=>
string(3) "也"
[2]=>
string(3) "得"
[3]=>
string(3) "要"
[4]=>
string(3) "看"
[5]=>
string(6) "對象"
[6]=>
string(3) "啊"
}
ini_set('memory_limit', '1024M');

// Load the library classes through Composer's autoloader.
require_once __DIR__ . '/vendor/autoload.php';

use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\Jieba;

// Initialise with the "big" dictionary option.
Jieba::init(['mode' => 'default', 'dict' => 'big']);
Finalseg::init();

// Simplified Chinese input.
$seg_list = Jieba::cut("怜香惜玉也得要看对象啊!");
var_dump($seg_list);

// Traditional Chinese input.
$seg_list = Jieba::cut("憐香惜玉也得要看對象啊!");
var_dump($seg_list);
array(7) {
[0]=>
string(12) "怜香惜玉"
[1]=>
string(3) "也"
[2]=>
string(3) "得"
[3]=>
string(3) "要"
[4]=>
string(3) "看"
[5]=>
string(6) "对象"
[6]=>
string(3) "啊"
}
array(7) {
[0]=>
string(12) "憐香惜玉"
[1]=>
string(3) "也"
[2]=>
string(3) "得"
[3]=>
string(3) "要"
[4]=>
string(3) "看"
[5]=>
string(6) "對象"
[6]=>
string(3) "啊"
}
ini_set('memory_limit', '1024M');

// Load the library classes through Composer's autoloader.
require_once __DIR__ . '/vendor/autoload.php';

use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\Jieba;

Jieba::init(['cjk' => 'all']);
Finalseg::init();

// Korean input.
$seg_list = Jieba::cut("한국어 또는 조선말은 제주특별자치도를 제외한 한반도 및 그 부속 도서와 한민족 거주 지역에서 쓰이는 언어로");
var_dump($seg_list);

// Japanese input, before any user dictionary is loaded.
$seg_list = Jieba::cut("日本語は、主に日本国内や日本人同士の間で使われている言語である。");
var_dump($seg_list);

// Loading a Japanese dictionary enables simple segmentation of Japanese text.
Jieba::loadUserDict("/path/to/your/japanese/dict.txt");
$seg_list = Jieba::cut("日本語は、主に日本国内や日本人同士の間で使われている言語である。");
var_dump($seg_list);
array(15) {
[0]=>
string(9) "한국어"
[1]=>
string(6) "또는"
[2]=>
string(12) "조선말은"
[3]=>
string(24) "제주특별자치도를"
[4]=>
string(9) "제외한"
[5]=>
string(9) "한반도"
[6]=>
string(3) "및"
[7]=>
string(3) "그"
[8]=>
string(6) "부속"
[9]=>
string(9) "도서와"
[10]=>
string(9) "한민족"
[11]=>
string(6) "거주"
[12]=>
string(12) "지역에서"
[13]=>
string(9) "쓰이는"
[14]=>
string(9) "언어로"
}
array(21) {
[0]=>
string(6) "日本"
[1]=>
string(3) "語"
[2]=>
string(3) "は"
[3]=>
string(3) "主"
[4]=>
string(3) "に"
[5]=>
string(6) "日本"
[6]=>
string(6) "国内"
[7]=>
string(3) "や"
[8]=>
string(6) "日本"
[9]=>
string(3) "人"
[10]=>
string(6) "同士"
[11]=>
string(3) "の"
[12]=>
string(3) "間"
[13]=>
string(3) "で"
[14]=>
string(3) "使"
[15]=>
string(3) "わ"
[16]=>
string(6) "れて"
[17]=>
string(6) "いる"
[18]=>
string(6) "言語"
[19]=>
string(3) "で"
[20]=>
string(6) "ある"
}
array(17) {
[0]=>
string(9) "日本語"
[1]=>
string(3) "は"
[2]=>
string(6) "主に"
[3]=>
string(9) "日本国"
[4]=>
string(3) "内"
[5]=>
string(3) "や"
[6]=>
string(9) "日本人"
[7]=>
string(6) "同士"
[8]=>
string(3) "の"
[9]=>
string(3) "間"
[10]=>
string(3) "で"
[11]=>
string(3) "使"
[12]=>
string(3) "わ"
[13]=>
string(6) "れて"
[14]=>
string(6) "いる"
[15]=>
string(6) "言語"
[16]=>
string(9) "である"
}
ini_set('memory_limit', '1024M');

// Load the library classes through Composer's autoloader.
require_once __DIR__ . '/vendor/autoload.php';

use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\Jieba;

Jieba::init(['mode' => 'test', 'dict' => 'big']);
Finalseg::init();

// Each result entry carries the word plus its start and end offsets (see the dump below).
$seg_list = Jieba::tokenize("永和服装饰品有限公司");
var_dump($seg_list);
array(4) {
[0] =>
array(3) {
'word' =>
string(6) "永和"
'start' =>
int(0)
'end' =>
int(2)
}
[1] =>
array(3) {
'word' =>
string(6) "服装"
'start' =>
int(2)
'end' =>
int(4)
}
[2] =>
array(3) {
'word' =>
string(6) "饰品"
'start' =>
int(4)
'end' =>
int(6)
}
[3] =>
array(3) {
'word' =>
string(12) "有限公司"
'start' =>
int(6)
'end' =>
int(10)
}
}
ini_set('memory_limit', '1024M');

// Load the library classes through Composer's autoloader.
require_once __DIR__ . '/vendor/autoload.php';

use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\Jieba;

// Initialise both classes before calling any cut method.
Jieba::init();
Finalseg::init();

$seg_list = Jieba::cut("怜香惜玉也得要看对象啊!");
var_dump($seg_list);

// The calls below were written in Python syntax (jieba.cut / jieba.cut_for_search);
// they now use the PHP static methods so the snippet actually runs.
$seg_list = Jieba::cut("我来到北京清华大学", true);
var_dump($seg_list); # full mode

$seg_list = Jieba::cut("我来到北京清华大学", false);
var_dump($seg_list); # default (accurate) mode

$seg_list = Jieba::cut("他来到了网易杭研大厦");
var_dump($seg_list);

$seg_list = Jieba::cutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"); # search engine mode
var_dump($seg_list);
array(7) {
[0]=>
string(12) "怜香惜玉"
[1]=>
string(3) "也"
[2]=>
string(3) "得"
[3]=>
string(3) "要"
[4]=>
string(3) "看"
[5]=>
string(6) "对象"
[6]=>
string(3) "啊"
}
Full Mode:
array(15) {
[0]=>
string(3) "我"
[1]=>
string(3) "来"
[2]=>
string(6) "来到"
[3]=>
string(3) "到"
[4]=>
string(3) "北"
[5]=>
string(6) "北京"
[6]=>
string(3) "京"
[7]=>
string(3) "清"
[8]=>
string(6) "清华"
[9]=>
string(12) "清华大学"
[10]=>
string(3) "华"
[11]=>
string(6) "华大"
[12]=>
string(3) "大"
[13]=>
string(6) "大学"
[14]=>
string(3) "学"
}
Default Mode:
array(4) {
[0]=>
string(3) "我"
[1]=>
string(6) "来到"
[2]=>
string(6) "北京"
[3]=>
string(12) "清华大学"
}
array(6) {
[0]=>
string(3) "他"
[1]=>
string(6) "来到"
[2]=>
string(3) "了"
[3]=>
string(6) "网易"
[4]=>
string(6) "杭研"
[5]=>
string(6) "大厦"
}
(此處,“杭研”並沒有在詞典中,但是也被 Viterbi 算法識別出來了)
Search Engine Mode:
array(18) {
[0]=>
string(6) "小明"
[1]=>
string(6) "硕士"
[2]=>
string(6) "毕业"
[3]=>
string(3) "于"
[4]=>
string(6) "中国"
[5]=>
string(6) "科学"
[6]=>
string(6) "学院"
[7]=>
string(9) "科学院"
[8]=>
string(15) "中国科学院"
[9]=>
string(6) "计算"
[10]=>
string(9) "计算所"
[11]=>
string(3) "后"
[12]=>
string(3) "在"
[13]=>
string(6) "日本"
[14]=>
string(6) "京都"
[15]=>
string(6) "大学"
[16]=>
string(18) "日本京都大学"
[17]=>
string(6) "深造"
}
ini_set('memory_limit', '600M');

// Load the library classes through Composer's autoloader.
require_once __DIR__ . '/vendor/autoload.php';

use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\Jieba;
use Fukuball\Jieba\JiebaAnalyse;

Jieba::init(['mode' => 'test', 'dict' => 'small']);
Finalseg::init();
JiebaAnalyse::init();

$top_k = 10;

// file_get_contents() has no fopen-style mode argument; its second parameter
// is the boolean use_include_path flag, so only the path is passed here.
$content = file_get_contents("/path/to/your/dict/lyric.txt");
if ($content === false) {
    throw new RuntimeException('Unable to read /path/to/your/dict/lyric.txt');
}

// Extract the top $top_k tags from the text.
$tags = JiebaAnalyse::extractTags($content, $top_k);
var_dump($tags);
array(10) {
["是否"]=>
float(1.2196321889395)
["一般"]=>
float(1.0032459890209)
["肌迫"]=>
float(0.64654314660465)
["怯懦"]=>
float(0.44762844339349)
["藉口"]=>
float(0.32327157330233)
["逼不得已"]=>
float(0.32327157330233)
["不安全感"]=>
float(0.26548304656279)
["同感"]=>
float(0.23929673812326)
["有把握"]=>
float(0.21043366018744)
["空洞"]=>
float(0.20598261709442)
}
ini_set('memory_limit', '600M');

// Load the library classes through Composer's autoloader.
require_once __DIR__ . '/vendor/autoload.php';

use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\Jieba;
use Fukuball\Jieba\Posseg;

Jieba::init();
Finalseg::init();
Posseg::init();

// Each result entry is an array with a "word" and a "tag" key (see the dump below).
$seg_list = Posseg::cut("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。");
var_dump($seg_list);
array(21) {
[0]=>
array(2) {
["word"]=>
string(3) "这"
["tag"]=>
string(1) "r"
}
[1]=>
array(2) {
["word"]=>
string(3) "是"
["tag"]=>
string(1) "v"
}
[2]=>
array(2) {
["word"]=>
string(6) "一个"
["tag"]=>
string(1) "m"
}
[3]=>
array(2) {
["word"]=>
string(18) "伸手不见五指"
["tag"]=>
string(1) "i"
}
[4]=>
array(2) {
["word"]=>
string(3) "的"
["tag"]=>
string(2) "uj"
}
[5]=>
array(2) {
["word"]=>
string(6) "黑夜"
["tag"]=>
string(1) "n"
}
[6]=>
array(2) {
["word"]=>
string(3) "。"
["tag"]=>
string(1) "w"
}
[7]=>
array(2) {
["word"]=>
string(3) "我"
["tag"]=>
string(1) "r"
}
[8]=>
array(2) {
["word"]=>
string(3) "叫"
["tag"]=>
string(1) "v"
}
[9]=>
array(2) {
["word"]=>
string(9) "孙悟空"
["tag"]=>
string(2) "nr"
}
[10]=>
array(2) {
["word"]=>
string(3) ","
["tag"]=>
string(1) "w"
}
[11]=>
array(2) {
["word"]=>
string(3) "我"
["tag"]=>
string(1) "r"
}
[12]=>
array(2) {
["word"]=>
string(3) "爱"
["tag"]=>
string(1) "v"
}
[13]=>
array(2) {
["word"]=>
string(6) "北京"
["tag"]=>
string(2) "ns"
}
[14]=>
array(2) {
["word"]=>
string(3) ","
["tag"]=>
string(1) "w"
}
[15]=>
array(2) {
["word"]=>
string(3) "我"
["tag"]=>
string(1) "r"
}
[16]=>
array(2) {
["word"]=>
string(3) "爱"
["tag"]=>
string(1) "v"
}
[17]=>
array(2) {
["word"]=>
string(6) "Python"
["tag"]=>
string(3) "eng"
}
[18]=>
array(2) {
["word"]=>
string(3) "和"
["tag"]=>
string(1) "c"
}
[19]=>
array(2) {
["word"]=>
string(3) "C++"
["tag"]=>
string(3) "eng"
}
[20]=>
array(2) {
["word"]=>
string(3) "。"
["tag"]=>
string(1) "w"
}
}
ini_set('memory_limit', '1024M');

// Load the library classes through Composer's autoloader.
require_once __DIR__ . '/vendor/autoload.php';

use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\Jieba;

// Initialise with the "big" dictionary option.
Jieba::init(['mode' => 'default', 'dict' => 'big']);
Finalseg::init();

// Simplified Chinese input.
$seg_list = Jieba::cut("怜香惜玉也得要看对象啊!");
var_dump($seg_list);

// Traditional Chinese input.
$seg_list = Jieba::cut("憐香惜玉也得要看對象啊!");
var_dump($seg_list);
array(7) {
[0]=>
string(12) "怜香惜玉"
[1]=>
string(3) "也"
[2]=>
string(3) "得"
[3]=>
string(3) "要"
[4]=>
string(3) "看"
[5]=>
string(6) "对象"
[6]=>
string(3) "啊"
}
array(7) {
[0]=>
string(12) "憐香惜玉"
[1]=>
string(3) "也"
[2]=>
string(3) "得"
[3]=>
string(3) "要"
[4]=>
string(3) "看"
[5]=>
string(6) "對象"
[6]=>
string(3) "啊"
}
ini_set('memory_limit', '1024M');

// Load the library classes through Composer's autoloader.
require_once __DIR__ . '/vendor/autoload.php';

use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\Jieba;

Jieba::init(['cjk' => 'all']);
Finalseg::init();

// Korean input.
$seg_list = Jieba::cut("한국어 또는 조선말은 제주특별자치도를 제외한 한반도 및 그 부속 도서와 한민족 거주 지역에서 쓰이는 언어로");
var_dump($seg_list);

// Japanese input, before any user dictionary is loaded.
$seg_list = Jieba::cut("日本語は、主に日本国内や日本人同士の間で使われている言語である。");
var_dump($seg_list);

// Loading custom Japanese dictionary can do a simple word segmentation
Jieba::loadUserDict("/path/to/your/japanese/dict.txt");
$seg_list = Jieba::cut("日本語は、主に日本国内や日本人同士の間で使われている言語である。");
var_dump($seg_list);
array(15) {
[0]=>
string(9) "한국어"
[1]=>
string(6) "또는"
[2]=>
string(12) "조선말은"
[3]=>
string(24) "제주특별자치도를"
[4]=>
string(9) "제외한"
[5]=>
string(9) "한반도"
[6]=>
string(3) "및"
[7]=>
string(3) "그"
[8]=>
string(6) "부속"
[9]=>
string(9) "도서와"
[10]=>
string(9) "한민족"
[11]=>
string(6) "거주"
[12]=>
string(12) "지역에서"
[13]=>
string(9) "쓰이는"
[14]=>
string(9) "언어로"
}
array(21) {
[0]=>
string(6) "日本"
[1]=>
string(3) "語"
[2]=>
string(3) "は"
[3]=>
string(3) "主"
[4]=>
string(3) "に"
[5]=>
string(6) "日本"
[6]=>
string(6) "国内"
[7]=>
string(3) "や"
[8]=>
string(6) "日本"
[9]=>
string(3) "人"
[10]=>
string(6) "同士"
[11]=>
string(3) "の"
[12]=>
string(3) "間"
[13]=>
string(3) "で"
[14]=>
string(3) "使"
[15]=>
string(3) "わ"
[16]=>
string(6) "れて"
[17]=>
string(6) "いる"
[18]=>
string(6) "言語"
[19]=>
string(3) "で"
[20]=>
string(6) "ある"
}
array(17) {
[0]=>
string(9) "日本語"
[1]=>
string(3) "は"
[2]=>
string(6) "主に"
[3]=>
string(9) "日本国"
[4]=>
string(3) "内"
[5]=>
string(3) "や"
[6]=>
string(9) "日本人"
[7]=>
string(6) "同士"
[8]=>
string(3) "の"
[9]=>
string(3) "間"
[10]=>
string(3) "で"
[11]=>
string(3) "使"
[12]=>
string(3) "わ"
[13]=>
string(6) "れて"
[14]=>
string(6) "いる"
[15]=>
string(6) "言語"
[16]=>
string(9) "である"
}