小项(肥象)

人总是嫌弃自己的大脑运行得不够快,所以发明了电脑……
我的脑子总是记不住很多东西,所以我开设了Blog!

Sqlite 基本用法

> sqlite dbname #打开数据库

sqlite> select * from table_name where *; # 查询数据 {其实跟MySQL没什么不同的}

sqlite> select count(*) from sqlite_master where name='**' ; #查询某个表是否存在

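sqlite> delete from table_name; #清空某个表{未使用 AUTOINCREMENT 的表,清空后自增ID会从1重新开始;使用了 AUTOINCREMENT 的表还需执行 delete from sqlite_sequence where name='table_name'; 才能归零}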

sqlite> VACUUM; #重建整个数据库文件并回收空闲空间,功能等效于数据库的压缩功能{VACUUM 作用于整个数据库,不能只针对某个表}

sqlite> .exit #退出sqlite

本文章由『小项-怪物猪』于June 14, 2011发表在 FreeBSD栏目 | 暂时没有评论

测试脚本的运行时间

[buysz]$ time touch {1..10001}.txt

real 0m0.091s
user 0m0.024s
sys 0m0.064s

real 程序从开始到结束实际经过的时间(墙上时钟时间)
user 进程在用户态消耗的CPU时间
sys 进程在内核态消耗的CPU时间

本文章由『小项-怪物猪』于May 31, 2011发表在 临时文件栏目 | 暂时没有评论

Shell的一个小技巧:花括号展开(brace expansion,并不是正则)

[buysz]$ touch {1..100}
[buysz]$ ls
1 14 2 25 30 36 41 47 52 58 63 69 74 8 85 90 96
10 15 20 26 31 37 42 48 53 59 64 7 75 80 86 91 97
100 16 21 27 32 38 43 49 54 6 65 70 76 81 87 92 98
11 17 22 28 33 39 44 5 55 60 66 71 77 82 88 93 99
12 18 23 29 34 4 45 50 56 61 67 72 78 83 89 94
13 19 24 3 35 40 46 51 57 62 68 73 79 84 9 95

[buysz]$ wget http://www.52dpet.com/images/vip_{say,title}.gif
[buysz]$ ls
vip_say.gif vip_title.gif

本文章由『小项-怪物猪』于May 31, 2011发表在 临时文件栏目 | 暂时没有评论

从Google首页分离出来的一个JS动态logo

XML/HTML代码

  1. <script>  
  2. window.google={kEI:"OvfJTYuEH8SxcZ3t_aUP",kEXPI:"17259,28505,28555,29685,29795,29810,30035,30107,30152",kCSI:{e:"17259,28505,28555,29685,29795,29810,30035,30107,30152",ei:"OvfJTYuEH8SxcZ3t_aUP",expi:"17259,28505,28555,29685,29795,29810,30035,30107,30152"},   
  3. Toolbelt:{}};   
  4. </script>  
  5. <style>  
  6.     #hplogo{background:white;cursor:pointer;height:156px;position:relative;width:403px}   
  7.     #hplogo div{pointer-events:none;position:absolute}   
  8. </style>  
  9. <div id=hplogo>  
  10. <img src="http://www.google.com.hk/logos/2011/graham11-hp-start.png" border=0 />  
  11. </div>  
  12. <script>  
  13. (function(){try{if(!google.doodle)google.doodle={};var d=[[307,48,88,89],[307,48,89,89],[307,48,91,89],[305,49,93,89],[305,50,93,88],[305,50,93,88],[306,52,92,86],[305,53,93,84],[305,54,94,83],[306,54,93,83],[307,54,92,83],[307,54,92,83],[308,54,90,83],[308,54,90,83],[306,53,91,84],[306,53,91,84],[308,53,90,84],[308,53,90,84],[305,53,92,84],[305,52,92,85],[306,52,91,85],[308,51,88,87,1],[308,50,88,88],[308,49,88,88],[307,49,89,88],[307,50,89,87],[308,51,89,86],[307,54,90,83],[307,57,90,80],[306,58,92,79],[306,58,92,79],[305,60,92,77],[302,61,95,76],[302,63,95,74],[302,51,96,86],[302,66,98,71],[304,67,96,69],[301,63,96,74],[301,58,93,79],[291,52,94,85],[288,50,71,88],[285,43,76,95],[285,37,70,101],[281,29,55,109],[278,20,58,119],[278,20,55,119,1],[277,12,121,127],[271,2,122,138],[267,1,126,139],[264,0,136,140],[260,0,141,140],[255,0,148,140],[252,0,151,140],[249,2,121,138],[247,3,123,137],[246,3,123,137],[246,2,124,137],[258,2,112,137],[263,2,106,137],[263,2,106,137],[262,2,103,137],[260,2,104,136],[260,2,104,137,1],[268,2,98,137],[267,2,99,137],[266,2,97,137],[266,3,96,136],[264,3,99,136],[263,3,100,136],[261,3,100,136],[259,2,138,137],[254,2,126,137],[247,2,101,136],[240,2,108,136],[238,1,110,137],[230,1,118,138],[220,15,128,124],[211,18,137,121],[205,43,102,96],[202,45,104,93],[200,38,97,101],[198,38,104,101,1],[197,39,107,100],[197,39,112,100],[213,39,94,110],[212,40,95,111],[211,41,97,111],[209,42,99,112],[209,43,98,112],[213,43,87,112],[213,42,83,113],[211,40,86,109],[211,38,86,103],[211,37,88,112],[211,20,186,131],[213,27,167,122],[212,44,87,105],[210,44,88,98],[195,44,106,98],[189,44,110,98],[182,46,117,99],[173,44,118,96,1],[161,43,130,99],[154,42,137,97],[153,42,137,97],[153,42,137,97],[152,41,137,98],[151,41,137,97],[149,41,145,97],[148,25,144,114],[148,13,144,126],[141,12,153,127],[115,11,173,128],[108,7,180,133],[108,4,180,136],[108,3,176,137,1],[108,1,161,139],[105,1,235,138],[103,1,295,148],[103,0,277,149],[108,0,234,137],[101,0,232,137],[99,0,13
5,139],[95,0,244,139],[81,0,152,139],[69,0,164,139,1],[66,0,169,139],[65,0,170,139],[63,0,168,138],[61,0,159,138],[35,0,304,139],[19,0,189,140],[18,11,138,129],[18,11,137,129],[18,11,137,128],[18,6,135,133],[7,4,146,136],[6,4,147,136],[3,4,150,136,1],[3,5,150,135],[3,8,150,132],[4,6,394,145],[12,6,388,145],[11,8,389,144],[11,8,387,144],[11,8,387,143,1],[10,8,113,131],[11,8,111,131],[10,9,112,130],[12,9,116,130],[12,9,111,130],[12,9,111,130],[12,9,110,131],[12,34,113,106],[13,35,110,104]],e=d.length,f,g,h,i,j=-1,k=function(){google.nav&&google.nav.go?google.nav.go(""):window.location.href=""},l=function(){var a=d[f],c=document.getElementById("hplogo");if(c&&a[0]){var b=document.createElement("div");b.id="hplogo"+f;b.style.left=a[0]+"px";b.style.top=a[1]+"px";b.style.width=a[2]+"px";b.style.height=a[3]+"px";b.style.background="url(http://www.google.com.hk/logos/2011/graham11-hp-sprite.png) no-repeat "+-g+"px "+-h+"px";b.onmousedown=k;a[3]>i&&(i=a[3]);a[4]?(g=0,h+=i,i=0):g+=a[2];c.appendChild(b);++f;f< e&&(j=window.setTimeout(l,83))}},m=function(){google.doodle.a=!1;i=h=g=f=0;j!=-1&&(window.clearTimeout(j),j=-1);for(var a=0;a< e;++a){var c=document.getElementById("hplogo"+a);c&&c.parentNode&&c.parentNode.removeChild(c)}j=window.setTimeout(l,83)};if(!google.doodle.a){google.doodle.a=!0;var n=document.createElement("img");n.addEventListener?n.addEventListener("load",m,!1):n.attachEvent("onload",m);n.src="http://www.google.com.hk/logos/2011/graham11-hp-sprite.png"}}catch(o){google.ml(o,!1,{cause:"DOODLE"})};})();</script>  

本文章由『小项-怪物猪』于May 11, 2011发表在 临时文件栏目 | 暂时没有评论

新写的Python 采集

Python代码

  1. #!/usr/bin/env python   
  2. #-*-coding:utf-8-*-   
  3. #encoding=utf-8   
  4.   
  5. #--作者:小项--   
  6. #--预览:http://www.20hotel.com/news--   
  7.   
  8. import sys;   
  9. import os;   
  10. import re;   
  11. import random;   
  12. import urllib2;   
  13. import time;   
  14. import datetime;   
  15. #import socket;   
  16. import MySQLdb as mysql;   
  17.   
  18. reload(sys)   
  19.   
  20. sys.setdefaultencoding('utf-8')   
  21.   
  22. #--转到目录--   
  23. os.chdir('img')   
  24.   
  25. #urllib2.socket.setdefaulttimeout(15)   
  26.   
  27. User = 'username'  
  28. Passwd = 'password'  
  29. Host = 'localhost'  
  30. Db = 'dbname'  
  31.   
  32. home = "http://www.8264.com/"  
  33.   
  34. #--连接数据库--   
  35. contents = mysql.connect(user=User,passwd=Passwd,host=Host,db=Db,charset='utf8').cursor()   
  36.   
  37. lsid = []   
  38.   
  39. pnext = []   
  40.   
  41. for sid in xrange(1,100,10):   
  42.     lsid.append(str(sid))   
  43.   
  44. print "进行列表分段",lsid,"完成."   
  45. for tid in reversed(xrange(2,len(lsid)+1)):   
  46.     for i in reversed(xrange(int(lsid[(int(tid)-2):(int(tid)-1)][0]),int(lsid[(int(tid)-1):int(tid)][0]))):   
  47.         #print i   
  48.         #==进行列表获取==#   
  49.         request = urllib2.Request("http://www.8264.com/portal-list-catid-251-page-"+str(i)+".html")   
  50.         request.add_header('User-Agent','Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)')   
  51.         for u in reversed(re.findall('<h2><a href=\"(.*?)\" title=\'',re.findall('<div class=\"title_8264\">(.*?)<div class=\"pg\">',urllib2.urlopen(request).read(),re.DOTALL)[0],re.DOTALL)):   
  52.             #print u   
  53.             #--获取内容页面--   
  54.             newsurl = urllib2.Request(u)   
  55.             newsurl.add_header('User-Agent','Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)')   
  56.             news = urllib2.urlopen(newsurl).read()   
  57.             time.sleep(int(random.uniform(1,5)))   
  58.             #--获取标题--   
  59.             title = re.findall('<div class=\"newstitle\">(.*?)<\/div>',news,re.DOTALL)   
  60.             #--获取时间--   
  61.             dates = list(eval(re.sub('\,0',',',re.sub(':| |-',',',re.findall('<td align=\"center\" valign=\"middle\">.*?<div style=\"line-height:1.8; text-align:center;\">\xcc\xed\xbc\xd3\xca\xb1\xbc\xe4\xa3\xba(.*?) ',news,re.DOTALL)[0]))))   
  62.             #--进行时间格式化--   
  63.             #--2011-05-10 08:19 to 1305010787.029--   
  64.             ttime = datetime.datetime(dates[0],dates[1],dates[2],dates[3],dates[4])   
  65.             ptime = time.mktime(ttime.timetuple())   
  66.   
  67.             #--获取作者--   
  68.             athour = re.sub('<.*?>','',re.findall(' \xd7\xf7\xd5\xdf\xa3\xba(.*?)<br \/><a',news,re.DOTALL)[0])   
  69.   
  70.             #--获取分页链接--   
  71.             page = re.findall('<div class=\"pg\">(.*?)<\/div>',news,re.DOTALL)   
  72.             if page != []:   
  73.                 pnext = re.findall('<a href=\"(.*?)\">[0-9]*<\/a>',page[0],re.DOTALL)   
  74.                 one_img = []   
  75.                 one_txt = re.sub('<[a|A].*?>|<\/[a|A]>','',re.findall('<div class=\"newstext\">(.*?)<\/div>',news,re.DOTALL)[0])   
  76.                 newstxt = re.sub('[http:\/\/image.8264.com\/portal\/[0-9]*\/[0-9]*\/|http:\/\/image.8264.com\/portal\/photo\/[0-9]*\/[0-9]*\/]','',one_txt)   
  77.                 one_img.extend(re.findall('<IMG src=\"(.*?)\">',one_txt,re.DOTALL))   
  78.                 for one_dimg in one_img:   
  79.                     #--下载文章内图片--   
  80.                     one_yscurl = 'wget -q '+one_dimg   
  81.                     os.system(one_yscurl)   
  82.                 for p in pnext:   
  83.                     #print p,"\n"   
  84.                     more_img = []   
  85.                     morepage = urllib2.Request(p)   
  86.                     morepage.add_header('User-Agent','Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)')   
  87.                     pnewtxt = urllib2.urlopen(morepage).read()   
  88.                     txt = re.sub('<[a|A].*?>|<\/[a|A]>','',re.findall('<div class=\"newstext\">(.*?)<\/div>',pnewtxt,re.DOTALL)[0])   
  89.                     #--得到入库的内容--   
  90.                     ntxt = re.sub('[http:\/\/image.8264.com\/portal\/[0-9]*\/[0-9]*\/|http:\/\/image.8264.com\/portal\/photo\/[0-9]*\/[0-9]*\/]','',txt)   
  91.                     #--处理内容中的图片--   
  92.                     more_img.extend(re.findall('<IMG src=\"(.*?)\">',txt,re.DOTALL))   
  93.                     for more_dimg in more_img:   
  94.                         more_syscurl = 'wget -q '+more_dimg   
  95.                         os.system(more_syscurl)   
  96.   
  97.                     newstxt += ntxt   
  98.                 texts = title[0].decode('gbk','ignore').encode('utf-8'),newstxt.decode('gbk','ignore').encode('utf-8'),athour.decode('gbk','ignore').encode('utf-8'),ptime   
  99.                 #--进行数据插入--   
  100.                 contents.execute("INSERT INTO `dbname`.`table_name` (`aid`, `class_id`, `title`, `content`, `author`, `order`, `state_radio`, `time`, `view_num`, `img`, `CityID`) VALUES (NULL, '2', %s, %s, %s, '0', '2', %s, '0', '', '53');",texts);   
  101.                 print athour.decode('gbk','ignore').encode('utf-8'),"在",tuple(dates),"发表的",title[0].decode('gbk','ignore').encode('utf-8'),"发布成功!"   
  102.                 time.sleep(int(random.uniform(30,90)))   
  103.             else:   
  104.                 #pass   
  105.                 only_img = []   
  106.                 only_txt = re.sub('<[a|A].*?>|<\/[a|A]>','',re.findall('<div class=\"newstext\">(.*?)<\/div>',news,re.DOTALL)[0])   
  107.                 newstxt = re.sub('[http:\/\/image.8264.com\/portal\/[0-9]*\/[0-9]*\/|http:\/\/image.8264.com\/portal\/photo\/[0-9]*\/[0-9]*\/]','',only_txt)   
  108.                 only_img.extend(re.findall('<IMG src=\"(.*?)\">',only_txt,re.DOTALL))   
  109.                 for only_img in only_img:   
  110.                         only_syscurl = 'wget -q '+only_img   
  111.                         os.system(only_syscurl)   
  112.                 texts = title[0].decode('gbk','ignore').encode('utf-8'),newstxt.decode('gbk','ignore').encode('utf-8'),athour.decode('gbk','ignore').encode('utf-8'),ptime   
  113.                 contents.execute("INSERT INTO `dbname`.`table_name` (`aid`, `class_id`, `title`, `content`, `author`, `order`, `state_radio`, `time`, `view_num`, `img`, `CityID`) VALUES (NULL, '2', %s, %s, %s, '0', '2', %s, '0', '', '53');",texts);   
  114.                 print athour.decode('gbk','ignore').encode('utf-8'),"在",tuple(dates),"发表的",title[0].decode('gbk','ignore').encode('utf-8'),"发布成功!"   
  115.                 time.sleep(int(random.uniform(30,90)))   
  116.   
  117.         print "第",i,"页采集完成.休息一下,进入下一页采集."   
  118.         #--停顿一会--   
  119.         time.sleep(int(random.uniform(1200,3200)))   
  120. #--关闭数据库连接--   
  121. contents.close();  

本文章由『小项-怪物猪』于May 10, 2011发表在 Ubuntu栏目 | 暂时没有评论