小项(肥象)
人总是嫌弃自己的大脑运行得不够快,所以发明了电脑……
我脑子总是记不住很多东西,So我开设了Blog!!!
测试脚本的运行时间
[buysz]$ time touch {1..10001}.txt
real 0m0.091s
user 0m0.024s
sys 0m0.064s
real:命令从开始到结束实际经过的时间(墙钟时间)
user:进程在用户态消耗的 CPU 时间
sys:进程在内核态消耗的 CPU 时间
Shell的一个小技巧:花括号扩展(brace expansion,由 Shell 在执行命令前展开,并不是正则表达式)
[buysz]$ touch {1..100}
[buysz]$ ls
1 14 2 25 30 36 41 47 52 58 63 69 74 8 85 90 96
10 15 20 26 31 37 42 48 53 59 64 7 75 80 86 91 97
100 16 21 27 32 38 43 49 54 6 65 70 76 81 87 92 98
11 17 22 28 33 39 44 5 55 60 66 71 77 82 88 93 99
12 18 23 29 34 4 45 50 56 61 67 72 78 83 89 94
13 19 24 3 35 40 46 51 57 62 68 73 79 84 9 95
[buysz]$ wget http://www.52dpet.com/images/vip_{say,title}.gif
[buysz]$ ls
vip_say.gif vip_title.gif
从 Google 首页分离出来的一个 JS 动态 Logo
XML/HTML代码
<!--
  Stand-alone copy of an animated Google homepage logo.
  The first script defines the global `google` object the animation code
  expects, the style block sizes the #hplogo container, and the second
  script paints the animation frame by frame on top of the start image.
-->
<script>
window.google={kEI:"OvfJTYuEH8SxcZ3t_aUP",kEXPI:"17259,28505,28555,29685,29795,29810,30035,30107,30152",kCSI:{e:"17259,28505,28555,29685,29795,29810,30035,30107,30152",ei:"OvfJTYuEH8SxcZ3t_aUP",expi:"17259,28505,28555,29685,29795,29810,30035,30107,30152"},
Toolbelt:{}};
</script>
<style>
#hplogo{background:white;cursor:pointer;height:156px;position:relative;width:403px}
#hplogo div{pointer-events:none;position:absolute}
</style>
<div id=hplogo>
<img src="http://www.google.com.hk/logos/2011/graham11-hp-start.png" border=0 />
</div>
<script>
(function () {
  try {
    if (!google.doodle) google.doodle = {};

    // Frame table. Each entry is [left, top, width, height(, rowEnd)]:
    // the first four values position and size the <div> for that frame;
    // a truthy fifth value means the frame is the last one in its sprite
    // row, so the next frame starts at x = 0 of the following row.
    var d = [
        [307,48,88,89],[307,48,89,89],[307,48,91,89],[305,49,93,89],[305,50,93,88],[305,50,93,88],
        [306,52,92,86],[305,53,93,84],[305,54,94,83],[306,54,93,83],[307,54,92,83],[307,54,92,83],
        [308,54,90,83],[308,54,90,83],[306,53,91,84],[306,53,91,84],[308,53,90,84],[308,53,90,84],
        [305,53,92,84],[305,52,92,85],[306,52,91,85],[308,51,88,87,1],

        [308,50,88,88],[308,49,88,88],[307,49,89,88],[307,50,89,87],[308,51,89,86],[307,54,90,83],
        [307,57,90,80],[306,58,92,79],[306,58,92,79],[305,60,92,77],[302,61,95,76],[302,63,95,74],
        [302,51,96,86],[302,66,98,71],[304,67,96,69],[301,63,96,74],[301,58,93,79],[291,52,94,85],
        [288,50,71,88],[285,43,76,95],[285,37,70,101],[281,29,55,109],[278,20,58,119],[278,20,55,119,1],

        [277,12,121,127],[271,2,122,138],[267,1,126,139],[264,0,136,140],[260,0,141,140],[255,0,148,140],
        [252,0,151,140],[249,2,121,138],[247,3,123,137],[246,3,123,137],[246,2,124,137],[258,2,112,137],
        [263,2,106,137],[263,2,106,137],[262,2,103,137],[260,2,104,136],[260,2,104,137,1],

        [268,2,98,137],[267,2,99,137],[266,2,97,137],[266,3,96,136],[264,3,99,136],[263,3,100,136],
        [261,3,100,136],[259,2,138,137],[254,2,126,137],[247,2,101,136],[240,2,108,136],[238,1,110,137],
        [230,1,118,138],[220,15,128,124],[211,18,137,121],[205,43,102,96],[202,45,104,93],[200,38,97,101],
        [198,38,104,101,1],

        [197,39,107,100],[197,39,112,100],[213,39,94,110],[212,40,95,111],[211,41,97,111],[209,42,99,112],
        [209,43,98,112],[213,43,87,112],[213,42,83,113],[211,40,86,109],[211,38,86,103],[211,37,88,112],
        [211,20,186,131],[213,27,167,122],[212,44,87,105],[210,44,88,98],[195,44,106,98],[189,44,110,98],
        [182,46,117,99],[173,44,118,96,1],

        [161,43,130,99],[154,42,137,97],[153,42,137,97],[153,42,137,97],[152,41,137,98],[151,41,137,97],
        [149,41,145,97],[148,25,144,114],[148,13,144,126],[141,12,153,127],[115,11,173,128],[108,7,180,133],
        [108,4,180,136],[108,3,176,137,1],

        [108,1,161,139],[105,1,235,138],[103,1,295,148],[103,0,277,149],[108,0,234,137],[101,0,232,137],
        [99,0,135,139],[95,0,244,139],[81,0,152,139],[69,0,164,139,1],

        [66,0,169,139],[65,0,170,139],[63,0,168,138],[61,0,159,138],[35,0,304,139],[19,0,189,140],
        [18,11,138,129],[18,11,137,129],[18,11,137,128],[18,6,135,133],[7,4,146,136],[6,4,147,136],
        [3,4,150,136,1],

        [3,5,150,135],[3,8,150,132],[4,6,394,145],[12,6,388,145],[11,8,389,144],[11,8,387,144],
        [11,8,387,143,1],

        [10,8,113,131],[11,8,111,131],[10,9,112,130],[12,9,116,130],[12,9,111,130],[12,9,111,130],
        [12,9,110,131],[12,34,113,106],[13,35,110,104]
      ],
      e = d.length, // number of frames
      f,            // index of the next frame to draw
      g,            // x offset of the next frame inside the sprite
      h,            // y offset of the current sprite row
      i,            // tallest frame seen so far in the current sprite row
      j = -1,       // pending timer id, -1 when none is scheduled

      // Click handler for every frame: navigate to the (empty) target URL.
      k = function () {
        google.nav && google.nav.go ? google.nav.go("") : window.location.href = ""
      },

      // Draw frame f as an absolutely positioned <div> whose background is
      // the matching region of the sprite, then schedule the next frame.
      l = function () {
        var a = d[f], c = document.getElementById("hplogo");
        if (c && a[0]) {
          var b = document.createElement("div");
          b.id = "hplogo" + f;
          b.style.left = a[0] + "px";
          b.style.top = a[1] + "px";
          b.style.width = a[2] + "px";
          b.style.height = a[3] + "px";
          b.style.background = "url(http://www.google.com.hk/logos/2011/graham11-hp-sprite.png) no-repeat " + -g + "px " + -h + "px";
          b.onmousedown = k;
          a[3] > i && (i = a[3]);
          // End of a sprite row: move down by the row height and restart at
          // x = 0; otherwise advance by this frame's width.
          a[4] ? (g = 0, h += i, i = 0) : g += a[2];
          c.appendChild(b);
          ++f;
          f < e && (j = window.setTimeout(l, 83))
        }
      },

      // (Re)start the animation: reset the counters, cancel a pending timer,
      // remove the frames drawn by an earlier run and schedule frame 0.
      m = function () {
        google.doodle.a = !1;
        i = h = g = f = 0;
        j != -1 && (window.clearTimeout(j), j = -1);
        for (var a = 0; a < e; ++a) {
          var c = document.getElementById("hplogo" + a);
          c && c.parentNode && c.parentNode.removeChild(c)
        }
        j = window.setTimeout(l, 83)
      };

    // Preload the sprite once; the animation starts from its load event.
    if (!google.doodle.a) {
      google.doodle.a = !0;
      var n = document.createElement("img");
      n.addEventListener ? n.addEventListener("load", m, !1) : n.attachEvent("onload", m);
      n.src = "http://www.google.com.hk/logos/2011/graham11-hp-sprite.png"
    }
  } catch (o) {
    // NOTE(review): google.ml is not defined by the stub object in this
    // snippet, so this handler would itself throw here -- confirm whether
    // the error reporter should be stubbed as well.
    google.ml(o, !1, {cause: "DOODLE"})
  };
})();
</script>
新写的 Python 采集脚本
Python代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape news articles from www.8264.com into MySQL (Python 2 script).

Author: 小项
Preview: http://www.20hotel.com/news

For every list page of portal category 251 (pages 90 down to 1) the script
fetches each linked article, follows the article's own pager, downloads the
images the text references into ``./img`` with ``wget`` and inserts one row
per article.

NOTE(review): the listing this was recovered from had lost its indentation,
so the loop nesting is reconstructed from statement order and the log
messages; in particular the long pause is assumed to run once per list
page -- confirm against the original script.
"""

import datetime
import os
import random
import re
import subprocess
import sys
import time

# urllib2 and MySQLdb are imported inside the functions that use them so the
# pure parsing helpers below stay importable without network or DB modules.

User = 'username'
Passwd = 'password'
Host = 'localhost'
Db = 'dbname'
home = "http://www.8264.com/"

IMAGE_DIR = 'img'
USER_AGENT = ('Mozilla/5.0 (compatible; Googlebot/2.1; '
              '+http://www.google.com/bot.html)')
LIST_URL = home + "portal-list-catid-251-page-%d.html"
INSERT_SQL = "INSERT INTO `dbname`.`table_name` (`aid`, `class_id`, `title`, `content`, `author`, `order`, `state_radio`, `time`, `view_num`, `img`, `CityID`) VALUES (NULL, '2', %s, %s, %s, '0', '2', %s, '0', '', '53');"

LIST_BLOCK_RE = re.compile(r'<div class="title_8264">(.*?)<div class="pg">',
                           re.DOTALL)
LIST_LINK_RE = re.compile(r"""<h2><a href="(.*?)" title='""", re.DOTALL)
TITLE_RE = re.compile(r'<div class="newstitle">(.*?)</div>', re.DOTALL)
# The escaped bytes are the GBK encoding of the label in front of the date.
# NOTE(review): the blank that ends this pattern (and the one that starts
# AUTHOR_RE) may have been "&nbsp;" before the listing was rendered as HTML;
# with a plain space the capture stops at the first space -- confirm.
DATE_RE = re.compile(
    '<td align="center" valign="middle">.*?'
    '<div style="line-height:1.8; text-align:center;">'
    '\xcc\xed\xbc\xd3\xca\xb1\xbc\xe4\xa3\xba(.*?) ', re.DOTALL)
# The escaped bytes are the GBK encoding of the label in front of the author.
AUTHOR_RE = re.compile(' \xd7\xf7\xd5\xdf\xa3\xba(.*?)<br /><a', re.DOTALL)
PAGER_RE = re.compile(r'<div class="pg">(.*?)</div>', re.DOTALL)
PAGER_LINK_RE = re.compile(r'<a href="(.*?)">[0-9]*</a>', re.DOTALL)
BODY_RE = re.compile(r'<div class="newstext">(.*?)</div>', re.DOTALL)
IMG_RE = re.compile(r'<IMG src="(.*?)">', re.DOTALL)
TAG_RE = re.compile(r'<.*?>')
# \b keeps tags that merely start with "a" (<abbr>, <area>, ...) intact.
ANCHOR_RE = re.compile(r'<[aA]\b.*?>|</[aA]>')
# The original wrapped its two alternatives in [...], which turned them into
# a character class that also ate text such as "2011/05/" or "http://".
IMAGE_PREFIX_RE = re.compile(
    r'http://image\.8264\.com/portal/(?:photo/)?[0-9]*/[0-9]*/')


def page_numbers(start=1, stop=100, step=10):
    """Return the list-page numbers in crawl order (highest first).

    The pages run from the last segment boundary of
    ``range(start, stop, step)`` (exclusive) down to the first one
    (inclusive); with the defaults that is 90, 89, ..., 1.  Fewer than two
    boundaries yield an empty list.
    """
    bounds = list(range(start, stop, step))
    if len(bounds) < 2:
        return []
    return list(range(bounds[-1] - 1, bounds[0] - 1, -1))


def parse_post_time(raw):
    """Return the numeric fields of a date such as ``2011-05-10 08:19``.

    The fields are parsed with ``int`` instead of running ``eval`` on text
    taken from a remote page, which also handles leading zeros.

    Raises:
        ValueError: if fewer than three numbers (year, month, day) are found.
    """
    fields = [int(number) for number in re.findall(r'[0-9]+', raw)]
    if len(fields) < 3:
        raise ValueError('unrecognised post time: %r' % (raw,))
    return fields


def to_timestamp(fields):
    """Convert ``[year, month, day(, hour, minute)]`` to a local-time epoch."""
    moment = datetime.datetime(*fields[:5])
    return time.mktime(moment.timetuple())


def strip_anchors(html):
    """Remove ``<a ...>`` / ``</a>`` tags but keep the text they wrap."""
    return ANCHOR_RE.sub('', html)


def strip_image_prefix(html):
    """Drop the image host/path prefix so only bare file names remain."""
    return IMAGE_PREFIX_RE.sub('', html)


def extract_body(html):
    """Return ``(text_for_db, image_urls)`` for one article page.

    The image URLs are collected before the prefix is stripped, so they are
    still complete download addresses.

    Raises:
        ValueError: if the page has no ``newstext`` block.
    """
    body = strip_anchors(_first_group(BODY_RE, html, 'article body'))
    return strip_image_prefix(body), IMG_RE.findall(body)


def list_article_urls(html):
    """Return the article links of one list page in reverse page order.

    Raises:
        ValueError: if the list block is missing from the page.
    """
    block = _first_group(LIST_BLOCK_RE, html, 'article list')
    return LIST_LINK_RE.findall(block)[::-1]


def to_utf8(text):
    """Re-encode GBK bytes fetched from the site as UTF-8 bytes."""
    return text.decode('gbk', 'ignore').encode('utf-8')


def _first_group(pattern, text, what):
    """Return group 1 of the first match, or raise ValueError naming *what*."""
    match = pattern.search(text)
    if match is None:
        raise ValueError('%s not found' % what)
    return match.group(1)


def fetch(url):
    """Download *url* with the crawler's User-Agent and return the raw bytes.

    NOTE: no socket timeout is set (the original had one commented out), so
    a stalled connection blocks the crawl.
    """
    import urllib2

    request = urllib2.Request(url)
    request.add_header('User-Agent', USER_AGENT)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        response.close()


def download_images(urls):
    """Fetch every URL into the current directory with ``wget -q``.

    The URL is passed as a separate argument instead of being pasted into a
    shell command line, so characters such as ``&`` or ``;`` in a scraped
    address cannot break or inject commands.
    """
    for url in urls:
        subprocess.call(['wget', '-q', '--', url])


def scrape_article(url):
    """Fetch one article (all of its pages) and download its images.

    Returns:
        ``(title, content, author, dates, posted_at)`` where the first three
        are the raw GBK byte strings from the page, *dates* is the list from
        :func:`parse_post_time` and *posted_at* the matching epoch time.

    Raises:
        ValueError: if an expected piece of markup is missing.
        IOError: if a download fails (``urllib2.URLError`` derives from it).
    """
    news = fetch(url)
    time.sleep(int(random.uniform(1, 5)))

    title = _first_group(TITLE_RE, news, 'title')
    dates = parse_post_time(_first_group(DATE_RE, news, 'post time'))
    posted_at = to_timestamp(dates)
    author = TAG_RE.sub('', _first_group(AUTHOR_RE, news, 'author'))

    pager = PAGER_RE.findall(news)
    next_pages = PAGER_LINK_RE.findall(pager[0]) if pager else []

    content, images = extract_body(news)
    download_images(images)
    for page_url in next_pages:
        more_content, more_images = extract_body(fetch(page_url))
        download_images(more_images)
        content += more_content

    return title, content, author, dates, posted_at


def store_article(connection, cursor, article):
    """Insert one scraped article and commit it straight away.

    MySQLdb does not autocommit, so without the commit the rows would be
    discarded on transactional tables when the connection closes.
    """
    title, content, author, dates, posted_at = article
    row = (to_utf8(title), to_utf8(content), to_utf8(author), posted_at)
    cursor.execute(INSERT_SQL, row)
    connection.commit()
    print("%s 在 %s 发表的 %s 发布成功!"
          % (to_utf8(author), tuple(dates), to_utf8(title)))


def crawl(connection, cursor):
    """Walk all list pages and store every article that can be parsed.

    An article that fails to download or parse is reported on stderr and
    skipped; a list page that cannot be read still aborts the crawl.
    """
    segment_starts = [str(number) for number in range(1, 100, 10)]
    print("进行列表分段 %s 完成." % (segment_starts,))
    for page_no in page_numbers():
        for url in list_article_urls(fetch(LIST_URL % page_no)):
            try:
                article = scrape_article(url)
            except (IOError, ValueError) as err:
                sys.stderr.write("跳过 %s: %s\n" % (url, err))
                continue
            store_article(connection, cursor, article)
            time.sleep(int(random.uniform(30, 90)))
        print("第 %s 页采集完成.休息一下,进入下一页采集." % page_no)
        # Long pause before the next list page.
        time.sleep(int(random.uniform(1200, 3200)))


def main():
    """Prepare the process, open the database and run the crawl."""
    import MySQLdb as mysql

    # Python 2 only: make implicit str/unicode conversions use UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf-8')

    # wget saves into the working directory, so run from the image folder.
    if not os.path.isdir(IMAGE_DIR):
        os.makedirs(IMAGE_DIR)
    os.chdir(IMAGE_DIR)

    connection = mysql.connect(user=User, passwd=Passwd, host=Host, db=Db,
                               charset='utf8')
    cursor = connection.cursor()
    try:
        crawl(connection, cursor)
    finally:
        cursor.close()
        connection.close()


if __name__ == "__main__":
    main()