
Commit 769879f

first commit
1 parent 993b3a0 commit 769879f


72 files changed: +54292 -0 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@
 __pycache__/
 *.py[cod]
 *$py.class
+.idea
 
 # C extensions
 *.so

capture/__init__.py

Lines changed: 1 addition & 0 deletions
#coding=utf-8

capture/capture_scheduler.py

Lines changed: 34 additions & 0 deletions
#coding=utf-8

from capture.dbhelper import get_db
from capture.site_capture import SiteCapture
from datetime import datetime
import traceback
from apscheduler.scheduler import Scheduler


def init_scheduler():
    scheduler = Scheduler()
    scheduler.start()
    scheduler.add_interval_job(capture_job, minutes=1)


def capture_job():
    with get_db() as db:
        site_configs = db.query('''select id,site_url,article_limit,capture_class,site_name from capture_site
            where isenabled=1 and isdeleted=0 and
            now()>date_add(capture_date,interval interval_minute minute)''')

    for config in site_configs:
        try:
            capture = SiteCapture(config)
            capture.crawl()
        except:
            # print_exc() already prints and returns None, so don't wrap it in print
            traceback.print_exc()
            with get_db() as db:
                db.execute('update capture_site set failed_date= %s where id= %s', datetime.now(), config.id)
        finally:
            with get_db() as db:
                db.execute('update capture_site set capture_date= %s where id= %s', datetime.now(), config.id)


if __name__ == '__main__':
    init_scheduler()
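
With APScheduler 2.x (imported above as apscheduler.scheduler.Scheduler), jobs run on a daemon thread, so executing this module directly would exit before the first one-minute interval fires. A minimal standalone-runner sketch; the keep-alive loop is an assumption, not part of this commit:

#coding=utf-8
# Keep-alive runner sketch (assumed, not in the commit); APScheduler 2.x API as above.
import time

from capture.capture_scheduler import init_scheduler

if __name__ == '__main__':
    init_scheduler()
    try:
        # Keep the main thread alive so the daemon scheduler thread keeps firing.
        while True:
            time.sleep(60)
    except (KeyboardInterrupt, SystemExit):
        pass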

capture/dbhelper.py

Lines changed: 21 additions & 0 deletions
#coding=utf-8

from capture.torndb import Connection

DB_CONFIG = {
    'db': 'cinfo',
    'host': 'localhost',
    'user': 'lcomplete',
    'pass': '123abc456'
}


def get_db():
    db = Connection(
        host=DB_CONFIG['host'],
        user=DB_CONFIG['user'],
        password=DB_CONFIG['pass'],
        database=DB_CONFIG['db']
        # time_zone="+8:00"
    )
    return db
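
The rest of the commit uses get_db() as a context manager (with get_db() as db:), which assumes the vendored capture/torndb.py adds __enter__/__exit__ to Connection; the stock FriendFeed torndb does not define them. A usage sketch under that assumption:

#coding=utf-8
# Usage sketch; assumes the vendored torndb Connection supports the "with" protocol.
from capture.dbhelper import get_db

with get_db() as db:
    rows = db.query('select id, site_url from capture_site where isenabled=1')
    for row in rows:
        print row.id, row.site_url  # torndb rows allow attribute access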

capture/htmlhelper.py

Lines changed: 117 additions & 0 deletions
#coding=utf-8

import urllib2, re
from lxml import etree
from urlparse import urlparse, urljoin
from HTMLParser import HTMLParser


def get_page_resp(url):
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36')
    result = urllib2.urlopen(req).read()
    return result


def get_article_links(html, url, limit):
    doc = etree.HTML(html)
    a_elements = doc.xpath('//a')
    links = []
    for a in a_elements:
        text = get_innertext(a)
        if isarticle_link(a, text):
            href = a.attrib.get("href", '')
            links.append({
                "url": href if href.find('http') == 0 else urljoin(url, href),
                "text": text
            })
            if 0 < limit <= len(links):
                break
    return links


def get_innertext(a):
    text = a.xpath('string()')
    return text


def isarticle_link(a, text):
    '''Heuristically score an <a> element; a score of 1 or more means "article link".'''
    href = a.attrib.get("href", '')
    if text is None or href is None:
        return False
    charlen = width_len(text)
    if charlen < 10 or href.find('#') == 0 or href.find('javascript:') == 0:
        return False
    parsed = urlparse(href)
    path = parsed.path
    if path == '' or path == "/":
        return False

    base = 0.5
    # urlparse() strips the query string out of .path, so test .query directly
    if parsed.query:
        base += 0.1 if re.search(r'id', parsed.query, re.I) else -0.1
    elif re.match(r'^/[^/]+/?$', path):
        base -= 0.3
    if a.attrib.get('title', '') != '':
        base += 0.2
    if text.endswith('...'):
        base += 0.4
    # re.search rather than re.match: these patterns may occur anywhere in the string
    if re.search(r'/[^/]*-[^/]*/?$', path):
        base += 0.1
    if re.search(r'\d{4,}', path):
        base += 0.1
    if re.match(r'^[a-z]+$', text, re.I):
        base -= 0.4
    # punctuation typical of article titles; '(' and '[' escaped so they match literally
    positive_chars = ['”.*“', r'\(.*\)', '(.*)', '#.*#', r'\[.*\]', '【.*】', ':', ':', '——', ',', ',', r'\?',
                      '?', '《.*》', '!', '!', r'\d+\.html?']
    for pos_char in positive_chars:
        if re.search(pos_char, text, re.I):
            base += 0.2
    positive_links = ['article', 'detail', '/p/']
    for pos_link in positive_links:
        if re.search(pos_link, path, re.I):
            base += 0.2
    base += (charlen - 10) / 26.  # parenthesized: longer link texts score higher
    return base >= 1


def width_len(text):
    '''Display width of a string: two ASCII characters count as one character.'''
    length = len(text)
    utf8_length = len(text.encode('utf-8'))
    length = (utf8_length - length) / 2 + length
    return length / 2


def strip_tags(html):
    """
    Strip HTML tags from a string.
    >>> str_text=strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    html = ''.join(result)
    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'\n+', '\n', html)
    return html


def extract_brief(text, min_length, max_length):
    '''Take whole paragraphs until min_length is reached, truncating at max_length.'''
    paragraphs = text.split('\n')
    result = []
    length = 0
    for pa in paragraphs:
        cur_len = len(pa)
        to_len = cur_len + length
        if to_len < min_length:
            result.append(pa)
        elif to_len > max_length:
            result.append(pa[0:max_length - length])
            break  # the original fell through here and kept appending fragments
        else:
            result.append(pa)
            break
        length = to_len
    return '\n'.join(result)
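
A quick offline check of the link-scoring heuristics above; the sample markup and URLs are invented for illustration:

#coding=utf-8
# Offline usage sketch for get_article_links; the sample markup is invented.
from capture.htmlhelper import get_article_links, width_len

html = '''
<html><body>
<a href="/article/2013/0912/some-long-title">A reasonably long article title: worth keeping</a>
<a href="#top">top</a>
<a href="/about/">about</a>
</body></html>
'''
# Only the first link scores >= 1; the anchor link and the short nav link are rejected.
links = get_article_links(html, 'http://example.com/', 10)
for link in links:
    print link['url'], link['text']

print width_len(u'hello')  # 2: two ASCII chars count as one width unit, 5/2 = 2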

capture/install.sql

Lines changed: 42 additions & 0 deletions
CREATE SCHEMA `cinfo` DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci ;

CREATE TABLE `cinfo`.`capture_site` (
  `id` INT NOT NULL ,
  `site_url` VARCHAR(128) NOT NULL ,
  `article_limit` INT NOT NULL ,
  `interval_minute` INT NOT NULL ,
  `isenabled` BIT NOT NULL ,
  `isdeleted` BIT NOT NULL ,
  `capture_class` VARCHAR(45) NOT NULL COMMENT 'Python capturer class' ,
  PRIMARY KEY (`id`) )
COMMENT = 'Configuration of sites to capture';

ALTER TABLE `cinfo`.`capture_site` CHANGE COLUMN `id` `id` INT(11) NOT NULL AUTO_INCREMENT ;

CREATE TABLE `cinfo`.`capture_article` (
  `id` INT NOT NULL AUTO_INCREMENT ,
  `short_title` VARCHAR(128) NOT NULL ,
  `summary` TEXT NOT NULL ,
  `raw_url` VARCHAR(256) NOT NULL ,
  `site_id` INT NOT NULL ,
  `create_date` DATETIME NOT NULL ,
  `update_date` DATETIME NOT NULL ,
  PRIMARY KEY (`id`) );

ALTER TABLE `cinfo`.`capture_site` ADD COLUMN `capture_date` DATETIME NOT NULL AFTER `capture_class` , ADD COLUMN `failed_date` DATETIME NOT NULL AFTER `capture_date` ;

update capture_site set capture_date='2000-1-1',failed_date='2000-1-1' where id=1;

ALTER TABLE `cinfo`.`capture_article` ADD COLUMN `isshow` BIT NOT NULL AFTER `update_date` ;

ALTER TABLE `cinfo`.`capture_article` ADD COLUMN `brief` TEXT NOT NULL AFTER `summary` ;

ALTER TABLE `cinfo`.`capture_site` ADD COLUMN `site_name` VARCHAR(45) NOT NULL AFTER `site_url` ;

update capture_site set site_name='开源中国' where id=1;

insert into capture_site values(2,'http://news.dbanotes.net/','Startup News',
  20,30,1,0,'','2000-1-1','2000-1-1');

# Note: site_capture.py also inserts a site_name into capture_article, so the
# schema presumably needs a matching column, e.g.:
# ALTER TABLE `cinfo`.`capture_article` ADD COLUMN `site_name` VARCHAR(45) NOT NULL AFTER `brief` ;

#########################
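
For reference, a sketch of registering the same Startup News row through dbhelper instead of the raw positional insert; the explicit column list mirrors the schema after the ALTER statements above:

#coding=utf-8
# Sketch: register a capture site from Python; values are the same Startup News
# row the SQL above inserts positionally.
from capture.dbhelper import get_db

with get_db() as db:
    db.execute('''insert into capture_site
        (id, site_url, site_name, article_limit, interval_minute,
         isenabled, isdeleted, capture_class, capture_date, failed_date)
        values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''',
        2, 'http://news.dbanotes.net/', 'Startup News', 20, 30, 1, 0, '',
        '2000-1-1', '2000-1-1')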

capture/site_capture.py

Lines changed: 97 additions & 0 deletions
#coding=utf-8

from capture.dbhelper import get_db
from readability import Document
from capture.htmlhelper import get_page_resp, get_article_links, strip_tags, extract_brief
from datetime import datetime, timedelta
import logging, traceback
from newspaper import Config, Article


class SiteCapture(object):
    def __init__(self, config):
        self.config = config
        self.url = config['site_url']
        self.site_id = config['id']
        self.limit = config['article_limit']
        self.site_name = config['site_name']

    def crawl(self):
        links = self.get_article_links()
        links.reverse()
        for link in links:
            article = self.capture_article(link['url'], link['text'])
            if article:
                self.__save_article(article)

    def get_article_links(self):
        html = get_page_resp(self.url)
        links = get_article_links(html, self.url, self.limit)
        return links

    def capture_article(self, link_url, link_text):
        try:
            article = self.get_exist_article(link_url)
            if not article:
                article = {
                    "raw_url": link_url,
                    "site_id": self.site_id,
                    "create_date": datetime.now(),
                    "isshow": True,
                    "site_name": self.site_name
                }
            elif article.update_date + timedelta(days=1) > datetime.now():
                return  # captured within the last day; skip

            article["update_date"] = datetime.now()
            html = get_page_resp(link_url)

            # readability
            # article["summary"] = Document(html, url=link_url).summary(html_partial=True)
            # plain_text = strip_tags(article["summary"])
            # title = Document(html).short_title()
            # if not title or title == '':
            #     title = link_text
            # article["short_title"] = title

            # newspaper
            config = Config()
            config.keep_article_html = True
            news = Article(link_url, config=config, memoize_articles=False, language='zh')
            news.download(html)
            news.parse()
            article["short_title"] = news.title
            article["summary"] = news.article_html
            plain_text = strip_tags(news.meta_description)

            if len(plain_text) >= 200:
                article["brief"] = extract_brief(plain_text, 150, 200)
            else:
                article["brief"] = plain_text
            return article
        except:
            # format_exc() returns the traceback string; print_exc() would log None
            logging.error(u"failed to capture %s, traceback: %s", link_url, traceback.format_exc())

    def get_exist_article(self, link_url):
        with get_db() as db:
            article = db.get('select * from capture_article where raw_url= %s and site_id= %s', link_url, self.site_id)
        return article

    def __save_article(self, article):
        with get_db() as db:
            if "id" in article:
                db.execute('''update capture_article set short_title= %s, summary= %s, update_date = %s,
                        brief= %s, isshow= %s where id= %s ''',
                           article["short_title"], article["summary"], article["update_date"], article["brief"],
                           article["isshow"], article["id"])  # isshow was missing from the original argument list
            else:
                db.execute('''insert into capture_article
                        (short_title,summary,raw_url,site_id,create_date,update_date,isshow,brief,site_name)
                        values(%s,%s,%s,%s,%s,%s,%s,%s,%s)''',
                           article["short_title"], article["summary"], article["raw_url"], article["site_id"],
                           article["create_date"], article["update_date"], article["isshow"], article["brief"],
                           article["site_name"])
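
A minimal sketch of driving SiteCapture by hand, outside the scheduler; the config dict mimics the capture_site row that capture_job() selects, and the values are illustrative (running it needs network access and the cinfo database):

#coding=utf-8
# Sketch: run one capture by hand. The dict mimics the row capture_job()
# selects from capture_site; values here are illustrative.
from capture.site_capture import SiteCapture

config = {
    'id': 2,
    'site_url': 'http://news.dbanotes.net/',
    'article_limit': 20,
    'capture_class': '',
    'site_name': 'Startup News',
}
capture = SiteCapture(config)
capture.crawl()  # fetches the index page, scores links, stores new articles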
