-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathwork.php
152 lines (116 loc) · 4.08 KB
/
work.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
<?php
require __DIR__ . '/vendor/autoload.php';
use Webpatser\Uuid\Uuid;
/**
 * Converts the Tang-poetry JSON files of a local `chinese-poetry` checkout
 * into a fixed number of SQL files, each holding one multi-row
 * `INSERT INTO tb_poems ... VALUES ...;` statement.
 */
class work
{
    // Whether poems are passed through filter() before being exported.
    protected $_ifFilter = true;

    // Number of SQL files ("sections") the output is split into.
    protected $_section = 3;

    /**
     * Reads every `poet.tang.*.json` file below `chinese-poetry/json/` (next to
     * this script) and writes `chinese-poetry-<n>.sql` files into the script
     * directory, overwriting any existing ones.
     *
     * Terminates the script via die() when the source directory or the JSON
     * files cannot be found.
     *
     * @return void
     */
    public function run()
    {
        $dirPath = dirname(__FILE__);
        $sourceFilePath = $dirPath . '/chinese-poetry/json/';
        $sectionCount = max(1, (int) $this->_section);

        // The poetry repository must have been downloaded beforehand.
        if (!file_exists($sourceFilePath)) {
            die('古诗词仓库不存在,请按说明下载');
        }

        // Paths of the Tang-poetry JSON files.
        $tangFilePathList = glob("{$sourceFilePath}poet.tang.*.json");
        if (empty($tangFilePathList)) {
            die('路径不存在');
        }

        // Number of JSON files whose rows go into one SQL file.
        $eachFileLong = ceil(count($tangFilePathList) / $sectionCount);

        // Build the output paths by concatenation (not sprintf) so that a '%'
        // in the directory name cannot corrupt the file name.
        $sqlPaths = [];
        $rowCounts = [];
        for ($i = 1; $i <= $sectionCount; $i++) {
            $sqlPaths[$i] = $dirPath . '/chinese-poetry-' . $i . '.sql';
            $rowCounts[$i] = 0;
            file_put_contents(
                $sqlPaths[$i],
                "INSERT INTO `tb_poems` (`id`, `title`, `author`, `content`,`create_time`) VALUES \r\n"
            );
        }

        $converter = new \Woodylan\Converter\Converter();

        foreach ($tangFilePathList as $fileCount => $filePath) {
            $section = (int) floor($fileCount / $eachFileLong) + 1;

            // Skip files that cannot be read or do not decode to an array.
            $fileContent = file_get_contents($filePath);
            $poems = ($fileContent === false) ? null : json_decode($fileContent, true);
            if (!is_array($poems)) {
                continue;
            }

            $rows = [];
            foreach ($poems as $value) {
                if (!isset($value['paragraphs']) || !is_array($value['paragraphs'])) {
                    continue;
                }

                // Optional shape filter.
                if ($this->_ifFilter && !$this->filter($value['paragraphs'])) {
                    continue;
                }

                // Drop poems that contain the placeholder glyph for unreadable text.
                if ($this->stringInArray(implode('', $value['paragraphs']), ['□'])) {
                    continue;
                }

                // Escape each line first, then join with a literal backslash-n
                // (single quotes on purpose): the separator must reach the SQL
                // file as the two-character escape sequence and must not be
                // escaped a second time.
                $paragraphs = implode('\n', array_map('addslashes', $value['paragraphs']));
                $paragraphs = $converter->turn($paragraphs);

                $title = addslashes(isset($value['title']) ? $value['title'] : '');
                $author = addslashes(isset($value['author']) ? $value['author'] : '');
                $uuid = $this->createUuid();
                $time = time();

                $rows[] = "(\"{$uuid}\",\"{$title}\",\"{$author}\",\"{$paragraphs}\",{$time})";
            }

            if (empty($rows)) {
                continue;
            }

            $chunk = implode(",\r\n", $rows);
            // Separate this chunk from rows already written to the same SQL file.
            if ($rowCounts[$section] > 0) {
                $chunk = ",\r\n" . $chunk;
            }
            file_put_contents($sqlPaths[$section], $chunk, FILE_APPEND);
            $rowCounts[$section] += count($rows);
        }

        // Terminate every statement with a semicolon.
        // NOTE: a section that received no rows ends up as header + ';',
        // which is not a valid INSERT statement.
        foreach ($sqlPaths as $sqlPath) {
            file_put_contents($sqlPath, ';', FILE_APPEND);
        }
    }

    /**
     * Decides whether a poem has the wanted shape.
     *
     * A poem passes when it has exactly $sentenceLength lines, all lines have
     * the same byte length, and no line is longer than $charLength * 3 bytes
     * (lengths are measured with strlen(); the factor 3 presumably reflects
     * three bytes per CJK character in UTF-8).
     *
     * @param array $paragraphs     Lines of the poem.
     * @param int   $sentenceLength Required number of lines.
     * @param int   $charLength     Maximum characters per line (see above).
     *
     * @return bool True when the poem passes the filter.
     */
    public function filter($paragraphs, $sentenceLength = 2, $charLength = 16)
    {
        if (!is_array($paragraphs) || count($paragraphs) != $sentenceLength) {
            return false;
        }

        $maxBytes = $charLength * 3;
        $previousLength = null;

        // Track the previous length instead of indexing $key - 1, so the check
        // does not depend on the array having sequential integer keys.
        foreach ($paragraphs as $line) {
            $length = strlen($line);

            if ($previousLength !== null && $previousLength != $length) {
                return false;
            }
            if ($length > $maxBytes) {
                return false;
            }

            $previousLength = $length;
        }

        return true;
    }

    /**
     * Generates an identifier from a UUID with the hyphens removed.
     *
     * @param bool $short When true, only the 16 characters starting at offset 8
     *                    of the hyphen-less UUID are returned.
     *
     * @return string
     */
    public function createUuid($short = true)
    {
        $uuid = str_replace('-', '', Uuid::generate()->string);

        return $short ? substr($uuid, 8, 16) : $uuid;
    }

    /**
     * Tells whether $string contains at least one of the given substrings.
     *
     * @param string $string Haystack.
     * @param array  $array  Needles.
     *
     * @return bool
     */
    public function stringInArray($string, array $array)
    {
        foreach ($array as $value) {
            // strpos() returns 0 for a match at the very start, so the result
            // must be compared against false explicitly.
            if (strpos($string, $value) !== false) {
                return true;
            }
        }

        return false;
    }
}
// Run the export automatically when this script is executed.
($class = new work())->run();