我在尝试使用scrapy调用GooSeeker的API,复现官网给出的爬取房屋信息的例子,结果失败了。经过1天的调试,还是没成功。我发现在GooSeeker浏览器里面是可以获取到数据的,是因为我用的GooSeeker代码版本太低了吗?实在不知道是什么原因。
- # -*- coding: utf-8 -*-
- import scrapy
- import os
- from urllib import request
- from urllib.parse import quote
- from lxml import etree
class GsExtractor(object):
    """Hold an XSLT stylesheet and apply it to HTML to extract content.

    The stylesheet can be loaded from a file, from an in-memory string, or
    from the GooSeeker ``getextractor`` API. ``extract`` and ``extractHTML``
    then run it against a parsed document or raw HTML respectively.
    """

    def __init__(self):
        # The constructor must use double underscores; a method named
        # ``_init_`` is never invoked by Python, which would leave
        # ``self.xslt`` undefined until one of the setters is called.
        self.xslt = ""

    def setXsltFromFile(self, xsltFilePath):
        """Load the XSLT from the UTF-8 text file at ``xsltFilePath``."""
        with open(xsltFilePath, 'r', encoding='UTF-8') as xslt_file:
            self.xslt = xslt_file.read()

    def setXsltFromMem(self, xsltStr):
        """Use ``xsltStr`` directly as the XSLT."""
        self.xslt = xsltStr

    def setXsltFromAPI(self, APIKey, theme, middle=None, bname=None):
        """Download the XSLT through the GooSeeker API.

        ``theme`` and, when given, ``middle`` and ``bname`` are URL-quoted
        and sent as query parameters. ``APIKey`` is appended unquoted. The
        raw response body (bytes) is stored as the stylesheet.
        """
        apiurl = ("http://www.gooseeker.com/api/getextractor?key=" + APIKey
                  + "&theme=" + quote(theme))
        if middle:
            apiurl = apiurl + "&middle=" + quote(middle)
        if bname:
            apiurl = apiurl + "&bname=" + quote(bname)
        # Close the HTTP response deterministically once the body is read.
        with request.urlopen(apiurl) as apiconn:
            self.xslt = apiconn.read()

    def getXslt(self):
        """Return the currently loaded XSLT."""
        return self.xslt

    def extract(self, html):
        """Apply the loaded XSLT to ``html`` and return the result tree.

        ``html`` is passed straight to the compiled ``etree.XSLT``
        transform, so it is expected to be an already-parsed document.
        """
        xslt_root = etree.XML(self.xslt)
        transform = etree.XSLT(xslt_root)
        return transform(html)

    def extractHTML(self, html):
        """Parse ``html`` source with ``etree.HTML`` and run ``extract``."""
        doc = etree.HTML(html)
        return self.extract(doc)
-
-
class JobSpider(scrapy.Spider):
    """Spider that runs a GooSeeker extraction rule on an anjuke.com page.

    ``parse`` fetches the XSLT rule through ``GsExtractor.setXsltFromAPI``,
    applies it to the response body, prints the result and saves it to
    ``anjuke-result.xml`` in the current working directory.
    """

    name = 'job'
    allowed_domains = ['anjuke.com']
    start_urls = ['http://bj.zu.anjuke.com/fangyuan/p1']

    def parse(self, response):
        """Extract content from ``response`` and write it to an XML file."""
        print("----------------------------------------------------------------------------")
        # Create the extractor.
        bbsExtra = GsExtractor()
        # Load the XSLT extraction rule from the GooSeeker API.
        # NOTE(review): the API key is hard-coded here; consider moving it
        # into configuration instead of source.
        bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "安居客_房源")
        # Run the rule against the raw page body.
        result = bbsExtra.extractHTML(response.body)
        # Print the result; the GBK round-trip drops any character that
        # GBK cannot encode (presumably for a GBK console - confirm).
        print(str(result).encode('gbk', 'ignore').decode('gbk'))
        # Save the result; ``with`` guarantees the file is flushed and
        # closed, which the bare ``open(...).write(...)`` did not.
        # NOTE(review): ``result`` is written as-is to a binary file, which
        # assumes the result object is bytes-like - confirm.
        file_path = os.getcwd() + "/anjuke-result.xml"
        with open(file_path, "wb") as out_file:
            out_file.write(result)
        # Report where the result was stored.
        print("采集结果文件:" + file_path)
# Forum attachment markup that was fused onto the final line of the paste:
# <img src="https://www.gooseeker.com/doc/forum.php?mod=image&aid=10175&size=300x300&key=ca32112dd7768643&nocache=yes&type=fixnone" border="0" aid="attachimg_10175" alt="">
复制代码 |
|
|
|
|
共 5 个关于本帖的回复 最后回复于 2018-10-17 15:31