A file contains 300,000 records, one word per line.
Some of them are peer words (anagrams of each other). For example, post, stop and tops are peer words.
How can I find all the peer words in the file?
Please give me some ideas.
You can use Linux commands to do this kind of job. For example,
to count the lines that contain "Action(" in the files under a folder:
grep Action\( ~/www/pms/app/app/controllers/*.php | wc -l
My suggestion is to write a custom comparison function and sort with usort, so that peer words end up next to each other, and then output them in order.
The rough logic of the comparison function is:
/**
 * usort() comparator that places anagrams ("peer words") next to each other.
 *
 * Each word is reduced to a signature: its bytes sorted in ascending order.
 * Two words are anagrams exactly when their signatures are identical, and
 * comparing signatures (rather than the raw words) gives usort() a consistent
 * ordering. Returning 0 for anagrams but strcmp() of the raw words otherwise
 * would not be consistent: "post" < "qaaa" < "stop" while "post" == "stop".
 *
 * @param string $left
 * @param string $right
 * @return int 0 when the words are anagrams of each other; otherwise a negative
 *             or positive value according to the order of their signatures
 */
function cmp($left, $right) {
    // Words of different length can never be anagrams; their signatures differ
    // as well, so no separate length check is needed.
    $leftChars = str_split($left);
    $rightChars = str_split($right);

    // SORT_STRING keeps digit characters from being compared numerically.
    sort($leftChars, SORT_STRING);
    sort($rightChars, SORT_STRING);

    return strcmp(implode('', $leftChars), implode('', $rightChars));
}
1. Sort the 300,000 rows with usort and the cmp function above.
2. Walk the sorted data from row 2 to the end and check whether each row is a peer word of the previous row. If yes, output it; if not, move on to the next row.
That is roughly the idea. The code was written by hand.
<?php
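/**
 * Build a trie (prefix tree) that stores the words, to reduce storage and speed up lookups.
 * (T) marks a node that ends a word
 * (F) marks a node that does not end a word
 *
 * Words stored in the example:
 *   hi
 *   his
 *   is
 *
 *            root
 *           /    \
 *       h (F)    i (F)
 *         |        |
 *       i (T)    s (T)
 *         |
 *       s (T)
 */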
class TreeNode
{
    /** @var bool True when the path from the root to this node spells a stored word. */
    public $isStr = false;

    /** @var array Child nodes of this node, keyed by child index. */
    public $next = [];

    /**
     * TreeNode constructor.
     *
     * A new node is not a word and has no children; both defaults come from
     * the property initialisers above, so there is nothing left to do here.
     */
    public function __construct()
    {
    }
}
/// Builds the trie
/**
 * Trie (prefix tree) of lower-cased words with a suffix-driven prefix lookup.
 *
 * Words are added with insert(). find() takes every suffix of the query string
 * and returns the stored words that start with one of those suffixes.
 */
class Helper
{
    /** @var TreeNode Root node of the trie; it stands for the empty prefix. */
    public $treeRoot;

    /**
     * @var bool When true, child nodes are keyed by the character itself
     *           instead of by its numeric offset from 'a'.
     */
    public $debug = false;

    public function __construct()
    {
        $this->treeRoot = new TreeNode();
    }

    /**
     * Stores a word in the trie, creating the nodes of its path as needed.
     *
     * @param string $str word to store; it is lower-cased first
     */
    public function insert($str)
    {
        $str = strtolower($str); // every word is stored in lower case
        $length = strlen($str);
        $node = $this->treeRoot;
        for ($i = 0; $i < $length; ++$i) {
            // Square-bracket offsets: the $str{$i} form was removed in PHP 8.0.
            $index = $this->char2index($str[$i]);
            if (!isset($node->next[$index])) {
                $node->next[$index] = new TreeNode();
            }
            $node = $node->next[$index];
        }
        $node->isStr = true;
    }

    /**
     * Maps a character to the key used in TreeNode::$next.
     *
     * @param string $ch single character
     * @return int|string offset of the character from 'a', or the character
     *                    itself when $debug is enabled
     */
    private function char2index($ch)
    {
        return $this->debug ? $ch : ord($ch) - ord('a');
    }

    /**
     * Inverse of char2index(): maps a TreeNode::$next key back to its character.
     *
     * @param int|string $index
     * @return int|string
     */
    private function index2char($index)
    {
        return $this->debug ? $index : chr($index + ord('a'));
    }

    /**
     * Collects the stored words that start with any suffix of the given string.
     *
     * Suffixes are tried from the shortest (last character only) to the
     * longest (the whole string); duplicates are removed from the result.
     *
     * @param string $str query string; it is lower-cased first
     * @return array list of matching words, indexed from 0 without gaps
     */
    public function find($str)
    {
        $str = strtolower($str); // the trie only holds lower-case words
        $result = [];
        $suffix = ''; // grows from the back of $str, one character per step
        for ($i = strlen($str) - 1; $i >= 0; --$i) {
            // A threshold could be applied here, e.g. skipping suffixes
            // shorter than two characters.
            $suffix = $str[$i] . $suffix;
            $result = array_merge($result, $this->getResult($suffix));
        }

        // array_unique() keeps the original keys; re-index so callers get a plain list.
        return array_values(array_unique($result));
    }

    /**
     * Finds every stored word that starts with the given prefix.
     *
     * @param string $str prefix to look up (expected to be lower case already)
     * @return array matching words; empty when no stored word has this prefix
     */
    private function getResult($str)
    {
        // Walk down to the node that represents the prefix. Every step,
        // including the last one, must exist, otherwise nothing matches.
        $node = $this->treeRoot;
        $length = strlen($str);
        for ($i = 0; $i < $length; ++$i) {
            $index = $this->char2index($str[$i]);
            if (!isset($node->next[$index])) {
                return [];
            }
            $node = $node->next[$index];
        }

        // Breadth-first walk of the subtree below the prefix node. Each queue
        // entry carries a node together with the string spelled on the way to it.
        $result = [];
        $queue = new SplQueue();
        $queue->enqueue(['node' => $node, 'str' => $str]);
        while (!$queue->isEmpty()) {
            $current = $queue->dequeue();
            if ($current['node']->isStr) {
                $result[] = $current['str'];
            }
            // Each child extends the string of the node being expanded, so
            // $current must not be overwritten inside this loop.
            foreach ($current['node']->next as $index => $child) {
                $queue->enqueue([
                    'node' => $child,
                    'str' => $current['str'] . $this->index2char($index),
                ]);
            }
        }

        return $result;
    }
}
// Demo: fill the trie with a few sample words, then print the lookup result
// for two query strings.
$helper = new Helper();
foreach (['is', 'his', 'her', 'post', 'top', 'stop'] as $word) {
    $helper->insert($word);
}

foreach (['post', 'hi'] as $query) {
    $result = $helper->find($query);
    print_r($result);
}