
Commit 769879f

first commit
1 parent 993b3a0 commit 769879f


72 files changed: +54292 -0 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@
 __pycache__/
 *.py[cod]
 *$py.class
+.idea
 
 # C extensions
 *.so

capture/__init__.py

Lines changed: 1 addition & 0 deletions
#coding=utf-8

capture/capture_scheduler.py

Lines changed: 34 additions & 0 deletions
#coding=utf-8

from capture.dbhelper import get_db
from capture.site_capture import SiteCapture
from datetime import datetime
import traceback
from apscheduler.scheduler import Scheduler


def init_scheduler():
    scheduler = Scheduler()
    scheduler.start()
    scheduler.add_interval_job(capture_job, minutes=1)


def capture_job():
    with get_db() as db:
        site_configs = db.query('''select id,site_url,article_limit,capture_class,site_name from capture_site
            where isenabled=1 and isdeleted=0 and
            now()>date_add(capture_date,interval interval_minute minute)''')

    for config in site_configs:
        try:
            capture = SiteCapture(config)
            capture.crawl()
        except:
            # print_exc() already prints and returns None, so don't wrap it in print
            traceback.print_exc()
            with get_db() as db:
                db.execute('update capture_site set failed_date= %s where id= %s', datetime.now(), config.id)
        finally:
            with get_db() as db:
                db.execute('update capture_site set capture_date= %s where id= %s', datetime.now(), config.id)


if __name__ == '__main__':
    init_scheduler()
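
With APScheduler 2.x (imported above as apscheduler.scheduler.Scheduler), jobs run on a daemon thread, so executing this module directly would exit before the first one-minute interval fires. A minimal standalone-runner sketch; the keep-alive loop is an assumption, not part of this commit:

#coding=utf-8
# Keep-alive runner sketch (assumed, not in the commit); APScheduler 2.x API as above.
import time

from capture.capture_scheduler import init_scheduler

if __name__ == '__main__':
    init_scheduler()
    try:
        # Keep the main thread alive so the daemon scheduler thread keeps firing.
        while True:
            time.sleep(60)
    except (KeyboardInterrupt, SystemExit):
        pass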

capture/dbhelper.py

Lines changed: 21 additions & 0 deletions
#coding=utf-8

from capture.torndb import Connection

DB_CONFIG = {
    'db': 'cinfo',
    'host': 'localhost',
    'user': 'lcomplete',
    'pass': '123abc456'
}


def get_db():
    db = Connection(
        host=DB_CONFIG['host'],
        user=DB_CONFIG['user'],
        password=DB_CONFIG['pass'],
        database=DB_CONFIG['db']
        # time_zone="+8:00"
    )
    return db
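
The rest of the commit uses get_db() as a context manager (with get_db() as db:), which assumes the vendored capture/torndb.py adds __enter__/__exit__ to Connection; the stock FriendFeed torndb does not define them. A usage sketch under that assumption:

#coding=utf-8
# Usage sketch; assumes the vendored torndb Connection supports the "with" protocol.
from capture.dbhelper import get_db

with get_db() as db:
    rows = db.query('select id, site_url from capture_site where isenabled=1')
    for row in rows:
        print row.id, row.site_url  # torndb rows allow attribute access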

capture/htmlhelper.py

Lines changed: 117 additions & 0 deletions
#coding=utf-8

import urllib2, re
from lxml import etree
from urlparse import urlparse, urljoin
from HTMLParser import HTMLParser


def get_page_resp(url):
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36')
    result = urllib2.urlopen(req).read()
    return result


def get_article_links(html, url, limit):
    doc = etree.HTML(html)
    a_elements = doc.xpath('//a')
    links = []
    for a in a_elements:
        text = get_innertext(a)
        if isarticle_link(a, text):
            href = a.attrib.get("href", '')
            links.append({
                "url": href if href.find('http') == 0 else urljoin(url, href),
                "text": text
            })
            if 0 < limit <= len(links):
                break
    return links


def get_innertext(a):
    text = a.xpath('string()')
    return text


def isarticle_link(a, text):
    '''Heuristically score an <a> element; a score of 1 or more means "article link".'''
    href = a.attrib.get("href", '')
    if text is None or href is None:
        return False
    charlen = width_len(text)
    if charlen < 10 or href.find('#') == 0 or href.find('javascript:') == 0:
        return False
    parsed = urlparse(href)
    path = parsed.path
    if path == '' or path == "/":
        return False

    base = 0.5
    # urlparse() strips the query string out of .path, so test .query directly
    if parsed.query:
        base += 0.1 if re.search(r'id', parsed.query, re.I) else -0.1
    elif re.match(r'^/[^/]+/?$', path):
        base -= 0.3
    if a.attrib.get('title', '') != '':
        base += 0.2
    if text.endswith('...'):
        base += 0.4
    # re.search rather than re.match: these patterns may occur anywhere in the string
    if re.search(r'/[^/]*-[^/]*/?$', path):
        base += 0.1
    if re.search(r'\d{4,}', path):
        base += 0.1
    if re.match(r'^[a-z]+$', text, re.I):
        base -= 0.4
    # punctuation typical of article titles; '(' and '[' escaped so they match literally
    positive_chars = ['”.*“', r'\(.*\)', '(.*)', '#.*#', r'\[.*\]', '【.*】', ':', ':', '——', ',', ',', r'\?',
                      '?', '《.*》', '!', '!', r'\d+\.html?']
    for pos_char in positive_chars:
        if re.search(pos_char, text, re.I):
            base += 0.2
    positive_links = ['article', 'detail', '/p/']
    for pos_link in positive_links:
        if re.search(pos_link, path, re.I):
            base += 0.2
    base += (charlen - 10) / 26.  # parenthesized: longer link texts score higher
    return base >= 1


def width_len(text):
    '''Display width of a string: two ASCII characters count as one character.'''
    length = len(text)
    utf8_length = len(text.encode('utf-8'))
    length = (utf8_length - length) / 2 + length
    return length / 2


def strip_tags(html):
    """
    Strip HTML tags from a string.
    >>> str_text=strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    html = ''.join(result)
    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'\n+', '\n', html)
    return html


def extract_brief(text, min_length, max_length):
    '''Take whole paragraphs until min_length is reached, truncating at max_length.'''
    paragraphs = text.split('\n')
    result = []
    length = 0
    for pa in paragraphs:
        cur_len = len(pa)
        to_len = cur_len + length
        if to_len < min_length:
            result.append(pa)
        elif to_len > max_length:
            result.append(pa[0:max_length - length])
            break  # the original fell through here and kept appending fragments
        else:
            result.append(pa)
            break
        length = to_len
    return '\n'.join(result)
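
A quick offline check of the link-scoring heuristics above; the sample markup and URLs are invented for illustration:

#coding=utf-8
# Offline usage sketch for get_article_links; the sample markup is invented.
from capture.htmlhelper import get_article_links, width_len

html = '''
<html><body>
<a href="/article/2013/0912/some-long-title">A reasonably long article title: worth keeping</a>
<a href="#top">top</a>
<a href="/about/">about</a>
</body></html>
'''
# Only the first link scores >= 1; the anchor link and the short nav link are rejected.
links = get_article_links(html, 'http://example.com/', 10)
for link in links:
    print link['url'], link['text']

print width_len(u'hello')  # 2: two ASCII chars count as one width unit, 5/2 = 2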

capture/install.sql

Lines changed: 42 additions & 0 deletions
CREATE SCHEMA `cinfo` DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci ;

CREATE TABLE `cinfo`.`capture_site` (
  `id` INT NOT NULL ,
  `site_url` VARCHAR(128) NOT NULL ,
  `article_limit` INT NOT NULL ,
  `interval_minute` INT NOT NULL ,
  `isenabled` BIT NOT NULL ,
  `isdeleted` BIT NOT NULL ,
  `capture_class` VARCHAR(45) NOT NULL COMMENT 'Python capturer class' ,
  PRIMARY KEY (`id`) )
COMMENT = 'Configuration of sites to capture';

ALTER TABLE `cinfo`.`capture_site` CHANGE COLUMN `id` `id` INT(11) NOT NULL AUTO_INCREMENT ;

CREATE TABLE `cinfo`.`capture_article` (
  `id` INT NOT NULL AUTO_INCREMENT ,
  `short_title` VARCHAR(128) NOT NULL ,
  `summary` TEXT NOT NULL ,
  `raw_url` VARCHAR(256) NOT NULL ,
  `site_id` INT NOT NULL ,
  `create_date` DATETIME NOT NULL ,
  `update_date` DATETIME NOT NULL ,
  PRIMARY KEY (`id`) );

ALTER TABLE `cinfo`.`capture_site` ADD COLUMN `capture_date` DATETIME NOT NULL AFTER `capture_class` , ADD COLUMN `failed_date` DATETIME NOT NULL AFTER `capture_date` ;

update capture_site set capture_date='2000-1-1',failed_date='2000-1-1' where id=1;

ALTER TABLE `cinfo`.`capture_article` ADD COLUMN `isshow` BIT NOT NULL AFTER `update_date` ;

ALTER TABLE `cinfo`.`capture_article` ADD COLUMN `brief` TEXT NOT NULL AFTER `summary` ;

ALTER TABLE `cinfo`.`capture_site` ADD COLUMN `site_name` VARCHAR(45) NOT NULL AFTER `site_url` ;

update capture_site set site_name='开源中国' where id=1;

insert into capture_site values(2,'http://news.dbanotes.net/','Startup News',
  20,30,1,0,'','2000-1-1','2000-1-1');

# Note: site_capture.py also inserts a site_name into capture_article, so the
# schema presumably needs a matching column, e.g.:
# ALTER TABLE `cinfo`.`capture_article` ADD COLUMN `site_name` VARCHAR(45) NOT NULL AFTER `brief` ;

#########################
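
For reference, a sketch of registering the same Startup News row through dbhelper instead of the raw positional insert; the explicit column list mirrors the schema after the ALTER statements above:

#coding=utf-8
# Sketch: register a capture site from Python; values are the same Startup News
# row the SQL above inserts positionally.
from capture.dbhelper import get_db

with get_db() as db:
    db.execute('''insert into capture_site
        (id, site_url, site_name, article_limit, interval_minute,
         isenabled, isdeleted, capture_class, capture_date, failed_date)
        values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''',
        2, 'http://news.dbanotes.net/', 'Startup News', 20, 30, 1, 0, '',
        '2000-1-1', '2000-1-1')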

capture/site_capture.py

Lines changed: 97 additions & 0 deletions
#coding=utf-8

from capture.dbhelper import get_db
from readability import Document
from capture.htmlhelper import get_page_resp, get_article_links, strip_tags, extract_brief
from datetime import datetime, timedelta
import logging, traceback
from newspaper import Config, Article


class SiteCapture(object):
    def __init__(self, config):
        self.config = config
        self.url = config['site_url']
        self.site_id = config['id']
        self.limit = config['article_limit']
        self.site_name = config['site_name']

    def crawl(self):
        links = self.get_article_links()
        links.reverse()
        for link in links:
            article = self.capture_article(link['url'], link['text'])
            if article:
                self.__save_article(article)

    def get_article_links(self):
        html = get_page_resp(self.url)
        links = get_article_links(html, self.url, self.limit)
        return links

    def capture_article(self, link_url, link_text):
        try:
            article = self.get_exist_article(link_url)
            if not article:
                article = {
                    "raw_url": link_url,
                    "site_id": self.site_id,
                    "create_date": datetime.now(),
                    "isshow": True,
                    "site_name": self.site_name
                }
            elif article.update_date + timedelta(days=1) > datetime.now():
                return  # captured within the last day; skip

            article["update_date"] = datetime.now()
            html = get_page_resp(link_url)

            # readability
            # article["summary"] = Document(html, url=link_url).summary(html_partial=True)
            # plain_text = strip_tags(article["summary"])
            # title = Document(html).short_title()
            # if not title or title == '':
            #     title = link_text
            # article["short_title"] = title

            # newspaper
            config = Config()
            config.keep_article_html = True
            news = Article(link_url, config=config, memoize_articles=False, language='zh')
            news.download(html)
            news.parse()
            article["short_title"] = news.title
            article["summary"] = news.article_html
            plain_text = strip_tags(news.meta_description)

            if len(plain_text) >= 200:
                article["brief"] = extract_brief(plain_text, 150, 200)
            else:
                article["brief"] = plain_text
            return article
        except:
            # format_exc() returns the traceback string; print_exc() would log None
            logging.error(u"failed to capture %s, traceback: %s", link_url, traceback.format_exc())

    def get_exist_article(self, link_url):
        with get_db() as db:
            article = db.get('select * from capture_article where raw_url= %s and site_id= %s', link_url, self.site_id)
        return article

    def __save_article(self, article):
        with get_db() as db:
            if "id" in article:
                db.execute('''update capture_article set short_title= %s, summary= %s, update_date = %s,
                        brief= %s, isshow= %s where id= %s ''',
                           article["short_title"], article["summary"], article["update_date"], article["brief"],
                           article["isshow"], article["id"])  # isshow was missing from the original argument list
            else:
                db.execute('''insert into capture_article
                        (short_title,summary,raw_url,site_id,create_date,update_date,isshow,brief,site_name)
                        values(%s,%s,%s,%s,%s,%s,%s,%s,%s)''',
                           article["short_title"], article["summary"], article["raw_url"], article["site_id"],
                           article["create_date"], article["update_date"], article["isshow"], article["brief"],
                           article["site_name"])
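
A minimal sketch of driving SiteCapture by hand, outside the scheduler; the config dict mimics the capture_site row that capture_job() selects, and the values are illustrative (running it needs network access and the cinfo database):

#coding=utf-8
# Sketch: run one capture by hand. The dict mimics the row capture_job()
# selects from capture_site; values here are illustrative.
from capture.site_capture import SiteCapture

config = {
    'id': 2,
    'site_url': 'http://news.dbanotes.net/',
    'article_limit': 20,
    'capture_class': '',
    'site_name': 'Startup News',
}
capture = SiteCapture(config)
capture.crawl()  # fetches the index page, scores links, stores new articles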
