首页 技术 正文
技术 2022年11月10日
0 收藏 544 点赞 2,587 浏览 10629 个字
# _*_ coding: utf-8 _*_
"""Crawl short videos from yidianzixun.com and publish them into the CMS.

Approach:
1. List pages are rendered with PhantomJS (simulated clicks). Only the first
   page of each link (about 9-10 items) is scraped, and items are
   de-duplicated by title.
2. Meant to be run by a scheduled job once a day at 08:00.

NOTE(review): this script was recovered from a web page whose extraction
collapsed line breaks and dropped most numeric literals. Literals that could
not be inferred from context are collected in the ``NOTE(review)`` constants
below and must be confirmed against the original before deployment.

This is Python 2 code (``urlparse``, ``urllib.urlretrieve``, ``reload(sys)``).
"""
import MySQLdb
import redis
import sys
import os
import re
import urllib
import requests
import time
import hashlib
import traceback
import urlparse
import random
import signal
import matplotlib
matplotlib.use("Agg")  # select the non-interactive backend before moviepy is imported
import shutil
import socket  # used to bound slow image downloads
import multiprocessing
from config import IConfig
from video_list import ydzx_url_list
from bs4 import BeautifulSoup
from upload_images import UploadFile
from moviepy.editor import VideoFileClip
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# ---------------------------------------------------------------------------
# NOTE(review): the original literals for the values in this section were lost
# in extraction. These are placeholders chosen to be reasonable -- confirm.
# ---------------------------------------------------------------------------
SOCKET_TIMEOUT_SECONDS = 60      # argument of socket.setdefaulttimeout()
REQUEST_TIMEOUT_SECONDS = 30     # timeout= of requests.get()
PAGE_DELAY_RANGE = (3, 6)        # random.randrange() bounds around driver.get()
DOWNLOAD_DELAY_RANGE = (1, 3)    # random.uniform() bounds between downloads
CONTENT_WEIGHT = 0               # literal `weight` column value in cmstop_content
VIDEO_AID = 0                    # `aid` column value in cmstop_video
UNKNOWN_SOURCE_ID = 0            # initial value of `sourceid` before lookup

# The original comment on the limit check says the run time is three hours.
MAX_RUNTIME_SECONDS = 3 * 60 * 60

SECONDS_PER_MINUTE = 60
SECONDS_PER_HOUR = 60 * SECONDS_PER_MINUTE
SECONDS_PER_DAY = 24 * SECONDS_PER_HOUR
# NOTE(review): the source shows three stripped factors for months; a 30-day
# month (30 * 24 * 3600) is assumed.
SECONDS_PER_MONTH = 30 * SECONDS_PER_DAY

PHANTOMJS_PATH = '/usr/local/phantomjs/bin/phantomjs'
PHANTOMJS_USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
)

socket.setdefaulttimeout(SOCKET_TIMEOUT_SECONDS)

# Python 2 hack: make implicit unicode -> str conversions use UTF-8. The
# str(...) calls on scraped text throughout this file rely on it.
reload(sys)
sys.setdefaultencoding('utf-8')


class WxpnVideo(multiprocessing.Process):
    """Worker process that scrapes, re-hosts and publishes yidianzixun videos."""

    # (substring that selects the branch, capture pattern, seconds per unit),
    # checked in this order by time_cycle().
    _RELATIVE_UNITS = (
        ('天', '(.*?)天', SECONDS_PER_DAY),
        ('小时', '(.*?)小时', SECONDS_PER_HOUR),
        ('分', '(.*?)分', SECONDS_PER_MINUTE),
        ('月', '(.*?)个月', SECONDS_PER_MONTH),
    )

    def __init__(self):
        """Load configuration and open the Redis, MySQL and OSS handles."""
        self.redisConf = IConfig.load('resource.redis')
        self.redisServer = redis.Redis(
            host=self.redisConf['host'],
            port=self.redisConf['port'],
            db=self.redisConf['db'],
            password=self.redisConf['passwd'])

        self.dbConfig = IConfig.load('resource.mysql')
        self.conn = MySQLdb.connect(
            user=self.dbConfig['user'],
            passwd=self.dbConfig['password'],
            db=self.dbConfig['dbname'],
            host=self.dbConfig['host'],
            charset="utf8",
            use_unicode=True)
        self.conn.ping(True)
        self.cursor = self.conn.cursor()

        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
            'Host': 'www.yidianzixun.com',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }

        self.domain = IConfig.load('resource.domain')
        self.apiConf = IConfig.load('resource.apiurl')

        # Redis set of list-page links still to crawl / set of seen title hashes.
        self.key_video_list = 'wxpn:video:list'
        self.key_title = 'wxpn:video:title'

        self.storeConfig = IConfig.load('resource.store')
        self.thumb_path = self.storeConfig['images_path']

        self.ossConf = IConfig.load('resource.oss')
        self.key_id = self.ossConf['access_key_id']
        self.key_secret = self.ossConf['access_key_secret']
        # 'endponit' (sic) is the key used in the config file; keep the spelling.
        self.endponit = self.ossConf['endponit']

        self.img_upload = UploadFile()
        self.auth = self.img_upload.auth_oss(self.key_id, self.key_secret)

        self.videoConf = IConfig.load('resource.apiurl')
        # The original stored this URL in `self.video_publish`, which shadowed
        # the video_publish() method on every instance and made it uncallable.
        self.video_publish_api = self.videoConf['video_publish_api']
        self.ydzx_page_api = self.videoConf['ydzx_page_api']

        self.start_time = int(time.time())
        multiprocessing.Process.__init__(self)

    def store_video_list_redis(self, video_list):
        """Queue every link of *video_list* in the Redis work set.

        Returns False when *video_list* is empty, otherwise None.
        """
        if not video_list:
            return False
        for per_list in video_list:
            # SADD ignores members that already exist, so no SISMEMBER
            # round-trip is needed first.
            self.redisServer.sadd(self.key_video_list, per_list)

    def _render_page(self, link):
        """Return the PhantomJS-rendered HTML of *link*, or None on failure."""
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = PHANTOMJS_USER_AGENT
        driver = None
        try:
            driver = webdriver.PhantomJS(
                desired_capabilities=dcap, executable_path=PHANTOMJS_PATH)
            time.sleep(random.randrange(*PAGE_DELAY_RANGE))
            driver.get(link)
            time.sleep(random.randrange(*PAGE_DELAY_RANGE))
            return driver.page_source
        except Exception:
            print(traceback.format_exc())
            return None
        finally:
            # Always tear the browser down, including when get() raised; the
            # original leaked the PhantomJS process in that case.
            if driver is not None:
                try:
                    driver.service.process.send_signal(signal.SIGTERM)
                    driver.quit()
                except Exception:
                    print(traceback.format_exc())

    def get_video_para(self):
        """Yield ``(title_md5, docid)`` for every not-yet-seen video.

        Pops list-page links from Redis until the work set is empty, renders
        each with PhantomJS and extracts titles and document ids.
        """
        while True:
            if self.redisServer.scard(self.key_video_list) == 0:
                break
            link = self.redisServer.spop(self.key_video_list)
            print(link)

            text = self._render_page(link)
            if text is None:
                continue

            soup = BeautifulSoup(text, 'lxml')
            title_list = soup.select('div.channel-news div.doc-title')
            itemid_list = soup.select('div.channel-news a.style-content-middle')

            if not (title_list and itemid_list):
                print('一点资讯视频列表{0}页链接请求失败,请及时查看原因'.format(link))
                continue

            try:
                for num, title in enumerate(title_list):
                    # Keep the exact str(...).strip() form: the digest is the
                    # de-duplication key already stored in Redis.
                    m = hashlib.md5()
                    m.update(str(title.text).strip())
                    psw = m.hexdigest()
                    print(title.text)
                    itemid = itemid_list[num]['data-docid']
                    if not self.redisServer.sismember(self.key_title, psw):
                        yield psw, itemid
            except Exception:
                print(traceback.format_exc())
                continue

    def time_cycle(self, origin_time):
        """Convert the site's publish-time text to a Unix timestamp.

        Handles '昨天', 'N天…', 'N小时…', 'N分…', 'N个月…' and absolute
        ``YYYY.MM.DD`` dates. Returns None (after printing the traceback)
        when the text cannot be parsed.
        """
        now = int(time.time())
        try:
            if origin_time == '昨天':
                return now - SECONDS_PER_DAY
            for marker, pattern, unit_seconds in self._RELATIVE_UNITS:
                if marker in origin_time:
                    amount = int(re.compile(pattern).findall(origin_time)[0])
                    return now - amount * unit_seconds
            time_array = time.strptime(origin_time, "%Y.%m.%d")
            return int(time.mktime(time_array))
        except Exception:
            print(traceback.format_exc())
            return None

    def _insert_row(self, sql, params):
        """Run one INSERT and commit; return lastrowid, or None after rollback."""
        try:
            self.cursor.execute(sql, params)
            self.conn.commit()
            return self.cursor.lastrowid
        except Exception:
            print(traceback.format_exc())
            self.conn.rollback()
            return None

    def download_video(self, psw, itemid):
        """Fetch one article, re-host its video and thumbnail, store and publish.

        psw    -- md5 hex digest of the title (de-duplication key).
        itemid -- yidianzixun document id.

        Returns False when the item is skipped or a step fails, True otherwise.
        Parsing errors on unexpected page markup propagate to the caller.
        """
        now = int(time.time())
        url = 'http://www.yidianzixun.com/article/' + itemid
        print(url)
        self.headers['Referer'] = url
        try:
            res = requests.get(url=url, headers=self.headers,
                               timeout=REQUEST_TIMEOUT_SECONDS)
            print(res.status_code)
        except Exception:
            # The original fell through here and crashed on the unbound `res`.
            print('小链接连接失败')
            return False

        if res.status_code != 200:
            print('一点资讯视频主链接请求失败,请及时查看原因')
            return False

        soup = BeautifulSoup(res.text, 'lxml')
        title = soup.select('div.left-wrapper > h2')[0].text

        try:
            video_tag = soup.select('div.video-wrapper > video')[0]
            video_src = video_tag['src']
        except (IndexError, KeyError):
            print('此篇为文章,不是视频')
            return False
        thumb_src = video_tag['poster']

        meta_spans = soup.select('body.page-article .left-wrapper > .meta > span')
        try:
            source = soup.select('body.page-article .left-wrapper > .meta > a')[0].text
        except Exception:
            source = meta_spans[0].text
        # NOTE(review): the original indentation is lost, so it is unclear
        # whether the label was stripped only in the fallback branch; it is
        # stripped unconditionally here.
        source = re.sub('来源:', '', str(source))

        # NOTE(review): the span index was lost in extraction; 0 is assumed.
        publishtime = meta_spans[0].text
        timestamp = self.time_cycle(str(publishtime))
        if timestamp is None:
            # time_cycle() reports failure by returning None, not by raising.
            timestamp = now

        # --- thumbnail -----------------------------------------------------
        img_url_query = urlparse.parse_qs(urlparse.urlparse(thumb_src).query, True)
        if 'wx_fmt' in img_url_query:
            ext_name = '.' + img_url_query['wx_fmt'][0]
        else:
            ext_name = '.png'

        thumb_p = self.thumb_path + 'video/thumb'
        if not os.path.exists(thumb_p):
            os.makedirs(thumb_p)

        # NOTE(review): the original sliced the digest (`psw[a:b]`) but the
        # bounds were lost; the full digest is used.
        file_name = psw[:] + ext_name
        img_down_local_path = thumb_p + '/' + file_name
        urllib.urlretrieve(thumb_src, img_down_local_path)

        if os.path.exists(img_down_local_path):
            images_path = self.ossConf['video_thumb_path']
            self.img_upload.upload_to_oss(
                self.auth, self.endponit, images_path, file_name, img_down_local_path)

        thumb_src = self.domain['img_url_oss'] + 'Cmstop/ydzx/' + file_name
        m = hashlib.md5()
        m.update(str(thumb_src))
        psw_thumb = m.hexdigest()

        durations = re.findall(r'"duration":(\d+)', str(res.text))
        playtime = durations[0] if durations else None

        # --- video file ----------------------------------------------------
        video_file = str(video_src).split('/')[-1]
        video_path = self.thumb_path + 'video/' + video_file
        video_res = requests.get(video_src, timeout=REQUEST_TIMEOUT_SECONDS).content
        with open(video_path, 'wb') as f:
            f.write(video_res)

        # Opening the file with moviepy is the validity check for the download.
        try:
            clip = VideoFileClip(video_path)
            print(clip.duration)
        except Exception:
            print(traceback.format_exc())
            return False

        # NOTE(review): the original sliced the file name (`[-1][a:b]`) but the
        # bounds were lost; the full name is used.
        video_name = video_file[:]
        if not os.path.exists(video_path):
            return False
        images_path = self.ossConf['video_path']
        status = self.img_upload.upload_to_oss(
            self.auth, self.endponit, images_path, video_name, video_path)
        if status != 'success':
            return False
        print('视频上传成功')
        video_link = self.domain['img_url_oss'] + 'Cmstop/video/ydzx/' + video_name

        # --- database ------------------------------------------------------
        topicid = self._insert_row(
            "insert into cmstop_comment_topic(title, description, thumb, created, url_md5, url) values(%s, '', %s, %s, %s, '')",
            (title, thumb_src, now, psw_thumb))
        if topicid is None:
            return False

        sourceid = self.get_article_sourceid(source)

        lastrowid = self._insert_row(
            """
            insert into cmstop_content(topicid, sourceid, catid, modelid, title, subtitle, source_title, source_link, weight, status, created, score, published, thumb, createdby)
            values(%s, %s, %s, %s, %s, %s, %s, %s, {0}, %s, %s, %s, %s, %s, %s)
            """.format(int(CONTENT_WEIGHT)),
            (topicid, sourceid, 47, 4, title, None, title, '', 3, now, 0, timestamp, thumb_src, 0))
        if lastrowid is None:
            # Do not mark the title as seen: the content row was not stored.
            return False

        # NOTE(review): the original used `[-1][:-N]` with N lost (probably
        # stripping ".mp4"); splitext removes the extension, whatever its length.
        video_id = os.path.splitext(video_file)[0]
        sql = "insert into cmstop_video(contentid, video, playtime, author, video_id, aid) values(%s, %s, %s, %s, %s, %s)"
        print(sql)
        # A failure here is only logged (by _insert_row), as in the original.
        self._insert_row(
            sql, (lastrowid, video_link, playtime, source, video_id, VIDEO_AID))

        self.redisServer.sadd(self.key_title, psw)

        api_url = self.video_publish_api + str(lastrowid)
        try:
            resp = urllib.urlopen(api_url)
            resp.read()
        except Exception:
            print('connect failed')
        return True

    def get_article_sourceid(self, source, medias=None):
        """Return the `cmstop_source` id for *source*, creating the row if needed.

        medias -- optional collection of UTF-8 encoded source names; when
                  *source* is in it the row's `has_signed_contract` flag is 1,
                  otherwise 0. An existing row is updated if its flag differs.

        Returns UNKNOWN_SOURCE_ID when a new row could not be inserted.
        """
        source = source.strip()
        sourceid = UNKNOWN_SOURCE_ID

        # Parameterised: `source` is scraped text and must not be spliced
        # into the SQL string.
        result = self.cursor.execute(
            'select `sourceid`, `name`, `has_signed_contract` from `cmstop_source` where `name`=%s',
            (source,))

        has_signed_contract = 0
        if medias and (set([source.encode('utf-8')]) & set(medias)):
            has_signed_contract = 1

        if result:
            data = self.cursor.fetchone()
            sourceid = data[0]
            if data[2] != has_signed_contract:
                try:
                    self.cursor.execute("""
                        update `cmstop_source` set `has_signed_contract`=%s where sourceid=%s
                        """, (has_signed_contract, sourceid))
                    self.conn.commit()
                except Exception:
                    self.conn.rollback()
        else:
            try:
                self.cursor.execute("""
                    insert into `cmstop_source`(`name`, `logo`, `url`, `initial`, `has_signed_contract`)
                    values(%s, %s, %s, %s, %s)
                    """, (source, '', '', '', has_signed_contract))
                self.conn.commit()
                sourceid = self.cursor.lastrowid
            except Exception:
                self.conn.rollback()

        return sourceid

    def _cleanup(self, lock_file):
        """Empty the local video scratch directory and remove the lock file."""
        video_dir = self.thumb_path + 'video'
        try:
            if os.path.isdir(video_dir):
                self.del_file(video_dir)
        except Exception:
            print(traceback.format_exc())
        if os.path.exists(lock_file):
            os.remove(lock_file)

    def run(self):
        """Process entry point: crawl until the queue is empty or time runs out.

        A lock file prevents overlapping runs; returns False when one exists.
        """
        os.system('pkill phantomjs')

        lockConf = IConfig.load('resource.lock')
        lock_file = lockConf['lock_path_ydzx']
        if os.path.exists(lock_file):
            print('lock file exists')
            return False
        open(lock_file, 'a').close()

        timed_out = False
        try:
            self.store_video_list_redis(ydzx_url_list)
            for psw, itemid in self.get_video_para():
                print(psw)
                if int(time.time()) - self.start_time >= MAX_RUNTIME_SECONDS:
                    timed_out = True
                    break
                try:
                    self.download_video(psw=psw, itemid=itemid)
                    time.sleep(random.uniform(*DOWNLOAD_DELAY_RANGE))
                    os.system('pkill ffmpeg-osx-v3.2.4')
                except Exception:
                    print(traceback.format_exc())
                    continue
        finally:
            # Runs on every exit path, so a crash cannot leave the lock behind.
            self._cleanup(lock_file)

        if timed_out:
            # As in the original, terminate immediately once the time budget
            # is used up (after the cleanup above).
            os._exit(0)

    def video_publish(self):
        """Re-trigger the publish API for a fixed range of content ids."""
        sql = 'select contentid from cmstop_video where contentid<=3528920 and contentid>=3430851'
        self.cursor.execute(sql)
        for row in self.cursor.fetchall():
            api_url = self.video_publish_api + str(row[0])
            try:
                resp = urllib.urlopen(api_url)
                resp.read()
            except Exception:
                print('connect failed')

    def del_file(self, path):
        """Delete every file and sub-directory inside *path* (not *path* itself)."""
        # Entries are joined to `path` rather than using os.chdir(), so the
        # process working directory is left untouched.
        for name in os.listdir(path):
            entry = os.path.join(path, name)
            if os.path.isfile(entry):
                os.remove(entry)
            else:
                shutil.rmtree(entry)


if __name__ == '__main__':
    video_one = WxpnVideo()
    video_one.start()
    video_one.join()
相关推荐
python开发_常用的python模块及安装方法
adodb:我们领导推荐的数据库连接组件;bsddb3:BerkeleyDB的连接组件;Cheetah-1.0:我比较喜欢这个版本的cheeta…
日期:2022-11-24 点赞:878 阅读:9,493
Educational Codeforces Round 11 C. Hard Process 二分
C. Hard Process题目连接:http://www.codeforces.com/contest/660/problem/CDes…
日期:2022-11-24 点赞:807 阅读:5,907
下载Ubuntu 17.04 内核源代码
zengkefu@server1:/usr/src$ uname -aLinux server1 4.10.0-19-generic #21…
日期:2022-11-24 点赞:569 阅读:6,740
可用Active Desktop Calendar V7.86 注册码序列号
可用Active Desktop Calendar V7.86 注册码序列号 Name: www.greendown.cn Code: …
日期:2022-11-24 点赞:733 阅读:6,495
Android调用系统相机、自定义相机、处理大图片
Android调用系统相机和自定义相机实例本博文主要是介绍了android上使用相机进行拍照并显示的两种方式,并且由于涉及到要把拍到的照片显…
日期:2022-11-24 点赞:512 阅读:8,133
Struts的使用
一、Struts2的获取  Struts的官方网站为:http://struts.apache.org/  下载完Struts2的jar包,…
日期:2022-11-24 点赞:671 阅读:5,297