python获得google自动完成下拉框文本
代码是从国外网站上找的,原来的一些参数已经不能用了,我做了一些调整。如果觉得有用,请给点鼓励,谢谢。
#!/usr/bin/python
# Copyright (C) 2010 <[email protected]>
import urllib2, sys, getopt, time
import socket
from urllib import quote
from xml.etree.ElementTree import XMLParser
from threading import Thread
# Startup banner, printed as soon as the script is loaded.
print("\n***************************")
print("* Google Suggest Scrapper *")
print("* Coded by ____________ *")
print("* [email protected] *")
print("***************************\n")

# Module-wide state shared by the functions below.  These are plain
# assignments: a ``global`` statement at module level has no effect.

# Counter of started worker threads; startThreads() resets it and each
# do.run() decrements it on exit.  Defined here so it always exists.
i = 0
# Number of extra rounds in which suggestions are re-queried (-r option).
recursiveLevel = 0
# Path of the file that receives the results; empty means "do not write".
outputFile = ''
def usage():
    """Print the command-line help text and terminate the program.

    Always ends by calling ``sys.exit()``, so it never returns to the caller.
    """
    helpLines = (
        "Usage: GoogleSuggest.py \n",
        " -k: Keywords: Keywords to use (separated by #)",
        " -f: File: File to read keywords from (overrides -k)",
        " -r: Recursive level (0-5): Use suggested keywords to get more keywords up to -r times ",
        " -t: Threads: Number of threads (default 5)",
        " -o: Output file: Save keywords found to file\n\n",
        "Examples:\n",
        " GoogleSuggest.py -k keyword1",
        " GoogleSuggest.py -k keyword1#keyword2#keyword3 -r 1",
        " GoogleSuggest.py -f keyword_file.txt -o keywords_found.txt -t 10",
    )
    for line in helpLines:
        # Single-argument print(...) emits the same bytes on Python 2 and 3.
        print(line)
    sys.exit()
class do(Thread):
    """Worker thread that collects Google Suggest completions for one keyword.

    ``keyword`` is the seed to look up.  While the thread runs, every
    suggestion string it fetches is appended to ``result``; the caller reads
    ``result`` after joining the thread.
    """

    def __init__(self, keyword):
        Thread.__init__(self)
        self.keyword = keyword
        self.result = []

    def run(self):
        """Fetch suggestions for the seed, then re-query them level by level.

        Level 0 queries the seed keyword itself.  Each of the following
        ``recursiveLevel`` rounds queries every suggestion found in the
        previous round (skipping the seed) and gathers all of their
        suggestions for the next round.

        The module-level counter ``i`` is decremented when the thread ends,
        even if a request raised, so the code that counts running workers
        never waits on a dead thread.
        """
        global i
        try:
            # strip() also removes the trailing newline of keywords that
            # were read from a file; a blank keyword has nothing to query.
            seed = self.keyword.strip()
            if not seed:
                return
            frontier = self.getKeywords(seed)
            for _ in range(recursiveLevel):
                nextFrontier = []
                for keyword in frontier:
                    if keyword == seed:
                        continue
                    # Accumulate: suggestions of *every* keyword feed the
                    # next level, not only those of the last one processed.
                    nextFrontier.extend(self.getKeywords(keyword))
                frontier = nextFrontier
        finally:
            i = i - 1

    def getKeywords(self, keyword):
        """Query the suggest service for ``keyword``.

        The suggestions are appended to ``self.result`` and also returned.
        A failed request is reported on stdout and yields an empty list so
        the remaining keywords of this worker are still processed.
        """
        url = 'http://clients1.google.com/complete/search?output=toolbar&q=' + quote(keyword)
        print(url)
        try:
            response = urllib2.urlopen(url)
            try:
                cont = response.read()
            finally:
                response.close()
        except IOError as err:
            # urllib2.URLError/HTTPError and socket errors derive from IOError.
            print('Request failed for %s: %s' % (url, err))
            return []
        found = self._parseSuggestions(cont)
        self.result.extend(found)
        return found

    def _parseSuggestions(self, cont):
        """Return the ``data`` attribute of every suggestion in the XML ``cont``."""
        parser = XMLParser()
        parser.feed(cont)
        tree = parser.close()
        found = []
        for entry in tree.findall('CompleteSuggestion'):
            node = entry.find('suggestion')
            if node is None:
                continue
            text = node.get('data')
            if not text:
                continue
            if not isinstance(text, str):
                # Python 2 only: non-ASCII attribute values come back as
                # ``unicode``, which quote() and plain file writes reject.
                text = text.encode('utf-8')
            found.append(text)
        return found
def startThreads(keywords):
    """Run one ``do`` worker per keyword, at most ``th`` at a time, then report.

    ``keywords`` is consumed from the front as workers are started.  Once
    all workers have finished, their ``result`` lists are concatenated in
    start order and handed to ``output()``.
    """
    global i
    # do.run() decrements this counter when a worker ends, so it has to
    # exist and be kept in step with the number of workers started.
    i = 0
    # ``th`` may still be the raw command-line string; force it to an int
    # of at least 1 so the comparison below is numeric and can succeed.
    limit = max(1, int(th))
    threads = []
    while len(keywords):
        try:
            running = sum(1 for t in threads if t.is_alive())
            if running < limit:
                keyword = keywords.pop(0)
                i = i + 1
                thread = do(keyword)
                thread.start()
                threads.append(thread)
            else:
                # All slots busy: pause briefly instead of spinning the CPU.
                time.sleep(0.05)
        except KeyboardInterrupt:
            print('Suspended by user...\n')
            sys.exit()
    ret = []
    for t in threads:
        t.join()
        ret.extend(t.result)
    output(ret)
def output(ret):
    """Print every entry of ``ret`` and, if requested, save them to a file.

    ``ret`` is a list of strings.  Each one is printed on its own line; when
    the module-level ``outputFile`` is non-empty the same lines are written
    to that path (the file is overwritten).  ``ret`` is left empty
    afterwards.  If the file cannot be opened a message is printed and the
    program exits.
    """
    lines = []
    for data in ret:
        lines.append(data + '\n')
        print(data)
    # Leave the caller's list drained, as callers may rely on it being consumed.
    del ret[:]
    if not outputFile:
        return
    try:
        f = open(outputFile, 'w')
    except IOError:
        print('Can\'t open output file\n')
        sys.exit()
    # Close the file even if writing fails part-way.
    with f:
        f.writelines(lines)
def run(argv):
    """Parse the command line and start the scraping threads.

    ``argv`` is the full argument vector including the program name (the
    script passes ``sys.argv``).  Recognised options are ``-k``, ``-f``,
    ``-r``, ``-t`` and ``-o``; see ``usage()`` for their meaning.  Invalid
    or missing input prints the usage text and exits.
    """
    global th
    global recursiveLevel
    global outputFile
    th = 5
    if len(argv) < 3:
        usage()
    try:
        # Skip the program name: getopt stops at the first non-option
        # argument and would otherwise report no options at all.
        opts, args = getopt.getopt(argv[1:], 'k:f:r:t:o:')
    except getopt.GetoptError:
        usage()

    inputKeywords = None
    keywordFile = None
    for opt, arg in opts:
        if opt == '-k':
            inputKeywords = arg.split('#')
        elif opt == '-f':
            keywordFile = arg
        elif opt == '-r':
            try:
                recursiveLevel = int(arg)
            except ValueError:
                usage()
        elif opt == '-t':
            # Keep the thread limit numeric so it can be compared with counts.
            try:
                th = int(arg)
            except ValueError:
                usage()
            if th < 1:
                usage()
        elif opt == '-o':
            outputFile = arg

    if keywordFile is not None:
        # A keyword file takes precedence over -k.
        try:
            inputFile = open(keywordFile, "r")
        except IOError:
            print('Can\'t open keywords file\n')
            sys.exit()
        with inputFile:
            inputKeywords = inputFile.readlines()

    if inputKeywords is None:
        # Neither -k nor -f was supplied.
        usage()
    # Drop surrounding whitespace (including line endings) and blank entries.
    inputKeywords = [word.strip() for word in inputKeywords if word.strip()]
    if not inputKeywords:
        usage()
    startThreads(inputKeywords)
# Script entry point: hand the raw command line to run() and turn Ctrl+C
# into a short message instead of a traceback.
if __name__ == "__main__":
    try:
        run(sys.argv)
    except KeyboardInterrupt:
        print("Ctrl+C Exit By USER...\n")
        sys.exit()
能看懂一些,但有些模块还不太懂……Python 的 urllib2 貌似已经升级了,增加了很多功能。
页:
[1]