201807 - 技术之王

智能文章系统实战-统计数据展示Pyhton版本(17)

发布于：2018-7-16 18:32　作者：admin　浏览：2841　

1. 查看数据

2. 图形代码

#!/usr/bin/python3
# -*- coding: utf-8 -*-


#引入库
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pymysql
from matplotlib.font_manager import *

#定义自定义字体，文件名从1.b查看系统中文字体中来
myfont = FontProperties(fname='/usr/share/fonts/stix/simsun.ttc')
#解决负号'-'显示为方块的问题
matplotlib.rcParams['axes.unicode_minus']=False

#初始化变量
x=[]
y1=[]
y2=[]



# 打开数据库连接
db = pymysql.connect("localhost","root","","article" )
 
# 使用 cursor() 方法创建一个游标对象 cursor
cursor = db.cursor()
 
# SQL 查询语句
sql = "SELECT stat_date,pv,ip FROM stat ORDER BY id DESC LIMIT 0,10"
try:
	# 执行SQL语句
	cursor.execute(sql)

	# 获取所有记录列表
	results = cursor.fetchall()
	for row in results:
		x.append(row[0])
		y1.append(row[1])
		y2.append(row[2])

	# 打印结果
	#print(x)
	#print(y1)
	#print(y2)
	
except:
	print ("Error!")
 
 
# 关闭数据库连接
db.close()



#绘制图形
plt.figure(figsize=(12,8))
plt.plot(x,y1,label='PV',color='r',marker='o')
plt.plot(x,y2,label='PV',color='b',marker='s')
plt.xlabel(u'日期',fontproperties=myfont)
plt.ylabel(u'统计数量',fontproperties=myfont)
plt.title(u'数据统计',fontproperties=myfont)
plt.xticks(rotation=0)
# 设置数字标签
for a, b in zip(x, y1):
    plt.text(a, b, b, ha='center', va='bottom', fontsize=20)
for a, b in zip(x, y2):
    plt.text(a, b, b, ha='center', va='bottom', fontsize=20)
plt.legend()
#plt.show()
plt.savefig("stat.png")

3.显示图形

智能文章系统实战-统计数据展示(16)

发布于：2018-7-13 15:58　作者：admin　浏览：2960　

1. 查看统计数据

2. 统计代码（PHP+JS）

<?php
header("Content-Type:text/html;charset=utf-8");
error_reporting(0);

//变量初始化
$title="数据统计";
$labelsArray=array();
$pvArray=array();
$ipArray=array();

//查询统计表数据
$mysqli = new mysqli('localhost', 'root', '', 'article');
if ($mysqli->connect_errno) {
    printf("数据库连接错误！");
    exit();
}

$sql="SELECT * FROM stat ORDER BY id DESC LIMIT 10";
$result = $mysqli->query($sql);
if($result)
{
	while($row = $result->fetch_array(MYSQLI_ASSOC))
	{
	  $labelsArray[]=$row['stat_date'];
	  $pvArray[]=$row['pv'];
	  $ipArray[]=$row['ip'];
	}
}
$mysqli->close();


//合成统计图形需要的数据
$labelsStr="";
$pvStr=0;
$ipStr=0;
if($labelsArray)
{
	$labelsStr="'".implode("','",array_reverse($labelsArray))."'";
}
if($pvArray)
{
	$pvStr=implode(",",array_reverse($pvArray));
}
if($ipArray)
{
	$ipStr=implode(",",array_reverse($ipArray));
}
?>
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="content-type" content="text/html; charset=utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"/>
    <meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0, user-scalable=no">
    <script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/2.7.2/Chart.bundle.js"></script>
	<title>数据统计</title>
</head>
<body>
<div>
<body>
	<div style="width:70%;" >
		<canvas id="canvas" style="text-align:center;"></canvas>
	</div>
	<br>
	<br>
	
	<script>
		
		var config = {
			type: 'line',
			data: {
				labels: [<?=$labelsStr?>],
				datasets: [{
					label: 'PV',
					backgroundColor: "#FF0000",
					borderColor: "#FF0000",
					data: [<?=$pvStr?>],
					fill: false,
				}, {
					label: 'IP',
					fill: false,
					backgroundColor: "#00FF00",
					borderColor: "#00FF00",
					data: [<?=$ipStr?>],
				}]
			},
			options: {
				responsive: true,
				title: {
					display: true,
					text: '数据统计'
				}
			}
		};

		window.onload = function() {
			var ctx = document.getElementById('canvas').getContext('2d');
			new Chart(ctx, config);
		};

		
	</script>
</body>
</html>

3.查看统计图

智能文章系统实战-Hive数据仓库(15)

发布于：2018-7-12 16:07　作者：admin　浏览：3184　

1. hive 安装

http://www.wangfeilong.cn/server/118.html

2.查看上节用Hadoop清洗过的数据pvlog.txt

[root@localhost hive]# cat /tmp/pvlog.txt 
192.168.100.1	1530460833	http://news.demo.com/h5.php?action=show&id=89
192.168.100.1	1530460803	http://news.demo.com/h5.php?action=show&id=128

3启动HIVE并且创建数据库

[root@localhost hive]# hive
which: no hbase in (/usr/local/soft/hive/bin:/usr/local/soft/Hadoop/hadoop/bin:/usr/local/soft/Hadoop/hadoop/sbin:/usr/local/soft/jdk1.8.0_17/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/root/bin)
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/local/soft/hive/lib/log4j-slf4j-impl-2.6.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/local/soft/Hadoop/hadoop/share/hadoop/common/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]

Logging initialized using configuration in jar:file:/usr/local/soft/hive/lib/hive-common-2.3.3.jar!/hive-log4j2.properties Async: true
Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.

hive> CREATE DATABASE IF NOT EXISTS article;
OK
Time taken: 7.758 seconds

hive> show databases;
OK
article
default
demo
wordcount
Time taken: 0.229 seconds, Fetched: 4 row(s)


hive> use article;
OK
Time taken: 0.068 seconds

4.hive详细日志表

4.1 创建明细表

hive> create table pvlog(
    > ip string,
    > times string, 
    > url string)
    > PARTITIONED BY (stat_date string) 
    > row format delimited fields terminated by '\t' stored as textfile;
OK
Time taken: 0.582 seconds

4.2 加载文本数据到HIVE数据库表

hive> load data local inpath '/tmp/pvlog.txt' overwrite into table pvlog partition(stat_date='2018-07-01');
Loading data to table article.pvlog partition (stat_date=2018-07-01)
OK
Time taken: 2.383 seconds

4.3 查询明细表数据

hive> select * from pvlog where stat_date = '2018-07-01';
OK
192.168.100.1	1530460833	http://news.demo.com/h5.php?action=show&id=89	2018-07-01
192.168.100.1	1530460803	http://news.demo.com/h5.php?action=show&id=128	2018-07-01
Time taken: 4.96 seconds, Fetched: 2 row(s)

5.统计数据

5.1 创建统计表

hive> create table stat(
    > stat_date string,
    > pv int, 
    > ip int
    > )
    > row format delimited fields terminated by '\t' stored as textfile;
OK
Time taken: 0.26 seconds

5.2 统计数据

hive> insert into stat   
    > select stat_date,count(*) as pv,count(distinct(ip)) as ip from pvlog where stat_date = '2018-07-01' group by stat_date;
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
Query ID = root_20180710175116_136d9e36-a8fc-4d0d-9f91-93dd71aba321
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Job = job_1531202649478_0010, Tracking URL = http://localhost:8088/proxy/application_1531202649478_0010/
Kill Command = /usr/local/soft/Hadoop/hadoop/bin/hadoop job  -kill job_1531202649478_0010
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
2018-07-10 17:51:36,560 Stage-1 map = 0%,  reduce = 0%
2018-07-10 17:51:52,289 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 2.23 sec
2018-07-10 17:52:07,262 Stage-1 map = 100%,  reduce = 100%, Cumulative CPU 5.86 sec
MapReduce Total cumulative CPU time: 5 seconds 860 msec
Ended Job = job_1531202649478_0010
Loading data to table article.stat
MapReduce Jobs Launched: 
Stage-Stage-1: Map: 1  Reduce: 1   Cumulative CPU: 5.86 sec   HDFS Read: 9792 HDFS Write: 83 SUCCESS
Total MapReduce CPU Time Spent: 5 seconds 860 msec
OK
Time taken: 53.74 seconds

5.3 查询统计结果

hive> select * from stat;
OK
2018-07-01	2	1
Time taken: 0.36 seconds, Fetched: 1 row(s)

6.查看统计的数据文件

7.把统计导入MYSQL数据库

智能文章系统实战-Hadoop海量数据统计(14)

发布于：2018-7-10 17:15　作者：admin　浏览：2612　

1.把hadoop添加到环境变量

#vi /etc/profile
export HADOOP_HOME=/usr/local/soft/Hadoop/hadoop
export PATH=${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:$PATH
export CLASSPATH=$($HADOOP_HOME/bin/hadoop classpath):$CLASSPATH

#source /etc/profile

2.启动hadoop

#hdfs namenode -format
#start-all.sh 
#hadoop fs -mkdir -p HDFS_INPUT_PV_IP

3.查看案例日志

#cat /var/log/nginx/news.demo.com.access.log-20180701
192.168.100.1 - - [01/Jul/2018:15:59:48 +0800] "GET http://news.demo.com/h5.php HTTP/1.1" 200 3124 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" "-"
192.168.100.1 - - [01/Jul/2018:16:00:03 +0800] "GET http://news.demo.com/h5.php?action=show&id=128 HTTP/1.1" 200 1443 "http://news.demo.com/h5.php" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" "-"
192.168.100.1 - - [01/Jul/2018:16:00:22 +0800] "GET http://news.demo.com/h5.php HTTP/1.1" 200 3124 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" "-"
192.168.100.1 - - [01/Jul/2018:16:00:33 +0800] "GET http://news.demo.com/h5.php?action=show&id=89 HTTP/1.1" 200 6235 "http://news.demo.com/h5.php" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" "-"

4. 采集日志数据 (sh 脚本，hadoop 脚本，实际使用配置计划任务每天00:01分处理昨天的日志)

#vi hadoopLog.sh

#!/bin/sh
#昨天的日期
yesterday=$(date --date='1 days ago' +%Y%m%d)
#测试案例日志日期
yesterday="20180701"
#hadoop命令行上传文件 
hadoop fs -put /var/log/news.demo.com.access.log-${yesterday} HDFS_INPUT_PV_IP/${yesterday}.log


#sh hadoopLog.sh

5. Hadoop处理日志数据PHP之MAP

#vi /usr/local/soft/Hadoop/hadoop/demo/StatMap.php

<?php
error_reporting(0);
while (($line = fgets(STDIN)) !== false) 
{  
   if(stripos($line,'action=show')>0)
   {
   
		$words = preg_split('/(\s+)/', $line);    
		echo $words[0].chr(9).strtotime(str_replace('/',' ',substr($words[3],1,11))." ".substr($words[3],13)).chr(9).$words[6].PHP_EOL;
   }
}  
?>

6. Hadoop处理日志数据PHP之Reduce

#vi /usr/local/soft/Hadoop/hadoop/demo/StatReduce.php

<?php
error_reporting(0);
$fp=fopen('/tmp/pvlog.txt','w+');

$pvNum=0;
$ipNum=0;
$ipList=array();
while (($line = fgets(STDIN)) !== false) 
{  
	$pvNum=$pvNum+1;
	$tempArray=explode(chr(9),$line);
    $ip = trim($tempArray[0]);
	if(!in_array($ip,$ipList))
	{
		$ipList[]=$ip;
		$ipNum=$ipNum+1;
	}
	
	//把每行的详细数据记录文件中，用户HIVE统计和HBASE详细记录
	fwrite($fp,$line);
}

fclose($fp);


//把统计的插入MYSQL数据库
$yestoday=date("Y-m-d",time()-86400); //实际统计昨天的数据
$yestoday='2018-07-01'; //以2018-07-01的日志进行测试
$mysqli = new mysqli('localhost', 'root', '', 'article');
$sql="INSERT INTO stat SET stat_date='{$yestoday}',pv={$pvNum},ip={$ipNum}";
$mysqli->query($sql);
$mysqli->close();


echo "DATE=".$yestoday.PHP_EOL;
echo "PV=".$pvNum.PHP_EOL;
echo "IP=".$ipNum.PHP_EOL;


?>

7. Hadoop处理日志数据（流方式）

#hadoop jar /usr/local/soft/Hadoop/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.9.1.jar  -mapper /usr/local/soft/Hadoop/hadoop/demo/StatMap.php -reducer /usr/local/soft/Hadoop/hadoop/demo/StatReduce.php  -input HDFS_INPUT_PV_IP/* -output HDFS_OUTPUT_PV_IP

8.查看统计结果

9.查看数据库的统计结果

10.查看/tmp/pvlog.txt的清洗结果

11.重复运行处理数据，需要删除已经存在的输出目录

#hadoop fs -rm -r -f HDFS_OUTPUT_PV_IP

12.案例命令集合

#hadoop fs -mkidr -p HDFS_INPUT_PV_IP
#hadoop fs -put /var/log/nginx/news.demo.com.access.log-20180701 HDFS_INPUT_PV_IP
#hadoop jar /usr/local/soft/Hadoop/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.9.1.jar  -mapper /usr/local/soft/Hadoop/hadoop/demo/StatMap.php -reducer /usr/local/soft/Hadoop/hadoop/demo/StatReduce.php  -input HDFS_INPUT_PV_IP/* -output HDFS_OUTPUT_PV_IP
#hadoop fs -cat HDFS_OUTPUT_PV_IP/*
MariaDB [article]> select * from stat;

智能文章系统实战-数据统计(13)

发布于：2018-7-1 16:53　作者：admin　浏览：2929　

1. 数据文件

#cat /var/log/nginx/news.demo.com.access.log-20180701
192.168.100.1 - - [01/Jul/2018:15:59:48 +0800] "GET http://news.demo.com/h5.php HTTP/1.1" 200 3124 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" "-"
192.168.100.1 - - [01/Jul/2018:16:00:03 +0800] "GET http://news.demo.com/h5.php?action=show&id=128 HTTP/1.1" 200 1443 "http://news.demo.com/h5.php" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" "-"
192.168.100.1 - - [01/Jul/2018:16:00:22 +0800] "GET http://news.demo.com/h5.php HTTP/1.1" 200 3124 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" "-"
192.168.100.1 - - [01/Jul/2018:16:00:33 +0800] "GET http://news.demo.com/h5.php?action=show&id=89 HTTP/1.1" 200 6235 "http://news.demo.com/h5.php" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" "-"

2. 数据表结构

CREATE TABLE IF NOT EXISTS `stat` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `stat_date` varchar(30) NOT NULL DEFAULT '' COMMENT '统计日期',
  `pv` int(11) NOT NULL DEFAULT '0' COMMENT 'PV量',
  `ip` int(11) NOT NULL DEFAULT '0' COMMENT 'IP量',
  PRIMARY KEY (`id`),
  UNIQUE KEY `stat_date` (`stat_date`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='文章阅读统计' AUTO_INCREMENT=1 ;

3. 统计代码

<?php
header("Content-Type:text/html;charset=utf-8");
error_reporting(E_ALL & ~E_NOTICE);
date_default_timezone_set('PRC'); 


$pv=0;
$ip=0;
$ipList=array();

$yestoday=date("Y-m-d",time()-86400); //实际读取昨天的日志
$yestoday='2018-07-01'; //读取2018-07-01的日志，用来测试
$fileName=date('Ymd',strtotime($yestoday));
//读取日志文件
$logPath="/var/log/nginx/news.demo.com.access.log-{$fileName}";
if(file_exists($logPath))
{
	$fileContents=file_get_contents($logPath);
	$fileArray=explode("\n",$fileContents);
	foreach($fileArray as $str)
	{
		//过滤只有阅读文章内容的数据
		if(strpos($str,'action=show'))
		{
			
			$tempArray=explode(" ",$str);
			
			//统计PV
			$pv=$pv+1;
			
			//统计IP
			$userip=$tempArray[0];
			if(!in_array($userip,$ipList))
			{
				$ipList[]=$userip;
				$ip=$ip+1;
			}
		}
	}
}



//插入MYSQL数据库
$mysqli = new mysqli('localhost', 'root', '', 'article');
$sql="INSERT INTO stat SET stat_date='{$yestoday}',pv={$pv},ip={$ip}";
$mysqli->query($sql);
$mysqli->close();


?>

4. 统计结果

//统计结果
MariaDB [article]> select * from stat;
+----+------------+----+----+
| id | stat_date  | pv | ip |
+----+------------+----+----+
|  1 | 2018-07-01 |  2 |  1 |
+----+------------+----+----+
1 row in set (0.00 sec)

智能文章系统实战-人工智能机器学习之内容推荐(12)

发布于：2018-7-1 10:49　作者：admin　浏览：2532　

1. 安装环境

#pip3 install pymysql
#pip3 install jieba
#pip3 install numpy
#pip3 install scipy
#pip3 install sklearn
#pip3 install pandas

2.内容推荐代码代码

#!/usr/bin/env python
#-*- coding:utf-8 -*-


#安装环境
#pip3 install pymysql
#pip3 install jieba
#pip3 install numpy
#pip3 install scipy
#pip3 install sklearn
#pip3 install pandas


#引入库
import pandas as pd #分词
import pymysql.cursors  #数据库
import re #正则过滤
import jieba #分词
from sklearn.feature_extraction.text import TfidfVectorizer  #结构化表示--向量空间模型
from sklearn.metrics.pairwise import linear_kernel



#初始化内容和分类的列表
dataList=[]



#定义函数,HTML转化为分词后用空格分离的字符串
def htmlToWords(html):
	reObject = re.compile(r'<[^>]+>',re.S)
	#过滤HTML
	text = reObject.sub('',html)
	#过滤\r\n
	text = re.sub('\t|\n|\r','',text)
	#分词
	words=jieba.cut(text)
	#把分词数组组成字符串返回
	return " ".join(words)
	



# 连接MySQL数据库
connection = pymysql.connect(host='localhost', port=3306, user='root', password='', db='article', charset='utf8', cursorclass=pymysql.cursors.DictCursor)

# 通过cursor创建游标
cursor = connection.cursor()

# 执行数据查询
sql = "SELECT `id`, `title`,`content` FROM `article` order by id desc limit 200"
cursor.execute(sql)

#查询数据库多条数据
result = cursor.fetchall()
for data in result:
    
	#HTML转化为分词后用空格分离的字符串赋值给words，此处以 标题title 进行相似度计算，也可以 文章内容content 进行相似度计算.
	item={'id':data['id'],'words':htmlToWords(data['title'])}
	dataList.append(item)
	
	




#创建数据集
ds = pd.DataFrame(dataList)

#将文本数据转化成特征向量
tf = TfidfVectorizer(analyzer='word', min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(ds['words'])

#人工智能进行相似度计算
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
#print(cosine_similarities)



#相似结果列表
resultList={}

#每篇文章和其他文章的相似度
for idx, row in ds.iterrows():
	
	#排序倒序，取前 5 篇文章相似的
	similar_indices = cosine_similarities[idx].argsort()[:-6:-1]
	
	#输出每篇文章相似的文章ID和文章相似度
	similar_items = [(cosine_similarities[idx][i], ds['id'][i]) for i in similar_indices]
	
	#用字典存储每篇文章ID对应的相似度结果
	resultList[row['id']]=similar_items

#输出每篇文章ID对应的相似度结果
#print(resultList)


#数据和文章ID=14 标题相似的 文章标题
resultDs=resultList[14]
print("标题相似的结果: ",resultDs)
for row in resultDs:
	
	#输出相似度>0的文章
	if row[0]>0:
		# 执行数据查询
		sql = "SELECT `id`, `title` FROM `article` WHERE id='%d' LIMIT 1"%(row[1])
		cursor.execute(sql)
		data = cursor.fetchone()
		
		#打印结果
		print(data)
		print("相似度=",row[0])
	
# 关闭数据连接
connection.close()

3. 结果输出

[root@bogon python]# python3 recommend.py 
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.564 seconds.
Prefix dict has been built succesfully.
标题相似的结果:  [(1.0, 14), (0.23201380925542303, 67), (0.215961061528388, 11), (0.09344442103258274, 29), (0.0, 1)]
{'id': 14, 'title': '个税大利好！起征点调至每年6万 增加专项扣除'}
相似度= 1.0
{'id': 67, 'title': '个税起征点拟上调至每年6万 第7次修正百姓获益几何?'}
相似度= 0.23201380925542303
{'id': 11, 'title': '个税起征点提高利好买房？月供1万个税可降2000多元'}
相似度= 0.215961061528388
{'id': 29, 'title': '个税起征点拟提至每月5000元 月薪万元能省多少？'}
相似度= 0.09344442103258274
[root@bogon python]#

智能文章系统实战-人工智能机器学习预测文章分类(11)

发布于：2018-7-1 10:33　作者：admin　浏览：2942　

1.安装环境

#pip3 install pymysql
#pip3 install jieba
#pip3 install numpy
#pip3 install scipy
#pip3 install sklearn

2.机器学习预测分类

#!/usr/bin/env python
#-*- coding:utf-8 -*-

#安装环境
#pip3 install pymysql
#pip3 install jieba
#pip3 install numpy
#pip3 install scipy
#pip3 install sklearn


#引入库
import pymysql.cursors  #数据库
import re #正则过滤
import jieba #分词
from sklearn.feature_extraction.text import CountVectorizer  #结构化表示--向量空间模型
from sklearn.model_selection import train_test_split #把数据分成训练集和测试集
from sklearn.naive_bayes import MultinomialNB #朴素贝叶斯分类器


#建立对象
vecObject = CountVectorizer(analyzer='word', max_features=4000,  lowercase = False)
classifierObject = MultinomialNB()

#初始化内容和分类的列表
contentList=[]
categoryList=[]



#定义函数,HTML转化为分词后用空格分离的字符串
def htmlToWords(html):
	reObject = re.compile(r'<[^>]+>',re.S)
	#过滤HTML
	text = reObject.sub('',html)
	#过滤\r\n
	text = re.sub('\t|\n|\r','',text)
	#分词
	words=jieba.cut(text)
	#把分词数组组成字符串返回
	return " ".join(words)
	



# 连接MySQL数据库
connection = pymysql.connect(host='localhost', port=3306, user='root', password='', db='article', charset='utf8', cursorclass=pymysql.cursors.DictCursor)

# 通过cursor创建游标
cursor = connection.cursor()

# 执行数据查询
sql = "SELECT `id`, `title`,`content`,`category` FROM `article` order by id desc limit 100"
cursor.execute(sql)

#查询数据库多条数据
result = cursor.fetchall()
for data in result:
    
	#HTML转化为分词后用空格分离的字符串
	wordsStr=htmlToWords(data['content'])
	
	#添加内容
	contentList.append(wordsStr)
	categoryList.append(data['category'])
	
	

# 关闭数据连接
connection.close()

#把数据分成训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(contentList, categoryList, random_state=1)


#结构化表示--向量空间模型
vecObject.fit(x_train)

#人工智能训练数据
classifierObject.fit(vecObject.transform(x_train), y_train)

#测试 测试集，准确度
score=classifierObject.score(vecObject.transform(x_test), y_test)
print("准确度score=",score,"\n")




#新数据预测分类

#预测分类案例1
predictHTML='<p>人民币对美元中间价为6.5569，创2017年12月25日以来最弱水平。人民币贬值成为市场讨论的热点，在美元短暂升值下，新兴市场货币贬值问题也备受关注。然而，从货币本身的升值与贬值来看，货币贬值的收益率与促进性是正常的，反而货币升值的破坏与打击则是明显的。当前人民币贬值正在进行中，市场预期破7的舆论喧嚣而起。尽管笔者也预计过年内破7的概率存在，但此时伴随中国股市下跌局面，我们应该审慎面对这一问题。</p>' #新的文章内容HTML
predictWords=htmlToWords(predictHTML)
predictCategory=classifierObject.predict(vecObject.transform([predictWords]))
print("案例1分词后文本=",predictWords,"\n")
print("案例1预测文本类别=","".join(predictCategory),"\n\n\n")


#预测分类案例2
predictHTML='<p>25日在报道称，央视罕见播放了多枚“东风-10A”巡航导弹同时命中一栋大楼的画面，坚固的钢筋混凝土建筑在导弹的打击下，瞬间灰飞烟灭。这种一栋大楼瞬间被毁的恐怖画面，很可能是在预演一种教科书式的斩首行动，表明解放军具备了超远距离的精准打击能力</p>' #新的文章内容HTML
predictWords=htmlToWords(predictHTML)
predictCategory=classifierObject.predict(vecObject.transform([predictWords]))
print("案例2分词后文本=",predictWords,"\n")
print("案例2预测文本类别=","".join(predictCategory),"\n")

3.输出结果

[root@bogon python]# python3 predictCategory.py 
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.588 seconds.
Prefix dict has been built succesfully.
准确度score= 0.8571428571428571 

案例1分词后文本= 人民币 对 美元 中间价 为 6.5569 ， 创 2017 年 12 月 25 日 以来 最 弱 水平 。 人民币 贬值 成为 市场 讨论 的 热点 ， 在 美元 短暂 升值 下 ， 新兴 市场 货币贬值 问题 也 备受 关注 。 然而 ， 从 货币 本身 的 升值 与 贬值 来看 ， 货币贬值 的 收益率 与 促进性 是 正常 的 ， 反而 货币 升值 的 破坏 与 打击 则 是 明显 的 。 当前 人民币 贬值 正在 进行 中 ， 市场 预期 破 7 的 舆论 喧嚣 而 起 。 尽管 笔者 也 预计 过年 内破 7 的 概率 存在 ， 但 此时 伴随 中国 股市 下跌 局面 ， 我们 应该 审慎 面对 这一 问题 。 

案例1预测文本类别= 金融 



案例2分词后文本= 25 日 在 报道 称 ， 央视 罕见 播放 了 多枚 “ 东风 - 10A ” 巡航导弹 同时 命中 一栋 大楼 的 画面 ， 坚固 的 钢筋 混凝土 建筑 在 导弹 的 打击 下 ， 瞬间 灰飞烟灭 。 这种 一栋 大楼 瞬间 被 毁 的 恐怖 画面 ， 很 可能 是 在 预演 一种 教科书 式 的 斩首 行动 ， 表明 解放军 具备 了 超 远距离 的 精准 打击 能力 

案例2预测文本类别= 军事 

[root@bogon python]#