博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
Scrapy实现腾讯招聘网信息爬取【Python】
阅读量:5809 次
发布时间:2019-06-18

本文共 7949 字,大约阅读时间需要 26 分钟。

一.腾讯招聘网

二.代码实现

  1.spider爬虫

# -*- coding: utf-8 -*-
import scrapy
from Tencent.items import TencentItem


class TencentSpider(scrapy.Spider):
    """Spider that walks the paginated job list and yields one item per row.

    Pagination is driven by ``offset``: the first request uses
    ``base_url + "0"`` and every parsed page schedules the next one by
    adding 10 to the offset, until the offset reaches 2770.
    """

    name = 'tencent'
    allowed_domains = ['tencent.com']
    base_url = 'https://hr.tencent.com/position.php?&start='
    offset = 0
    start_urls = [base_url + str(offset)]

    def parse(self, response):
        """Extract every job row of one listing page, then request the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            One ``TencentItem`` per ``<tr class="even">`` / ``<tr class="odd">``
            row, followed by a ``scrapy.Request`` for the next page while
            ``offset`` is still below 2770.
        """
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:
            # Create an Item object to hold the row's data.
            item = TencentItem()

            # extract_first(default="") instead of extract()[0]: a row with a
            # missing cell yields an empty string rather than raising
            # IndexError, which would abort this callback and, because the
            # next-page request is yielded after the loop, stop the crawl.
            item['positionName'] = node.xpath(
                "./td[1]/a/text()").extract_first(default="")
            item['positionType'] = node.xpath(
                "./td[2]/text()").extract_first(default="")
            item['positionNumber'] = node.xpath(
                "./td[3]/text()").extract_first(default="")
            item['location'] = node.xpath(
                "./td[4]/text()").extract_first(default="")
            item['publishTime'] = node.xpath(
                "./td[5]/text()").extract_first(default="")

            yield item

        # Decide whether another page has to be requested.
        # NOTE(review): 2770 is presumably the offset of the last page at the
        # time of writing - confirm against the live site.
        if self.offset < 2770:
            self.offset += 10
            # Build the URL of the next page.
            url = self.base_url + str(self.offset)
            # The URL is under allowed_domains, so dont_filter=True is not
            # passed and the default request filtering stays active.
            yield scrapy.Request(url, callback=self.parse)

  2.管道

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class TencentPipeline(object):
    """Item pipeline that appends every item to ``Tencent.json``.

    Each item is written as one JSON object followed by ``",\\n"``; the file
    is opened when the pipeline is constructed and closed in
    ``close_spider``.
    """

    def __init__(self):
        # Explicit UTF-8: items are dumped with ensure_ascii=False, so
        # non-ASCII text is written verbatim. Relying on the platform default
        # encoding can raise UnicodeEncodeError or produce a non-UTF-8 file.
        self.f = open("Tencent.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize ``item`` to the output file and pass it on unchanged.

        Args:
            item: the scraped item; anything ``dict()`` can convert.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item`` object, so later pipelines can process it.
        """
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.f.write(text)
        return item

    def close_spider(self, spider):
        """Close the output file."""
        self.f.close()

  3.实体

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentItem(scrapy.Item):
    """Fields describing a single job posting."""

    # define the fields for your item here like:
    # name = scrapy.Field()
    positionName = scrapy.Field()    # job title
    positionType = scrapy.Field()    # job category
    positionNumber = scrapy.Field()  # number of openings
    location = scrapy.Field()        # work location
    publishTime = scrapy.Field()     # publish date

三.结果【部分展示】

{
"positionName": "15605-动作RPG手游游戏活动策划", "positionType": "产品/项目类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "21645-高级法律顾问", "positionType": "职能类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "HGJ-senior legal counsel(MA)", "positionType": "职能类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "19837-电影新媒体媒介经理", "positionType": "市场类", "positionNumber": "1", "location": "北京", "publishTime": "2019-02-14"},{
"positionName": "HGJ-Legal Counsel (Technology Transactions)", "positionType": "职能类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "22989-云服务平台部--服务运营中心总监", "positionType": "技术类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "22989-经营平台产品中心web前端开发", "positionType": "技术类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "22989-视频云高级Web前端开发", "positionType": "技术类", "positionNumber": "2", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "27554-腾讯音乐商业制片人(上海)", "positionType": "市场类", "positionNumber": "1", "location": "上海", "publishTime": "2019-02-14"},{
"positionName": "30361-天天P图图像处理后台开发(上海)", "positionType": "技术类", "positionNumber": "1", "location": "上海", "publishTime": "2019-02-14"},{
"positionName": "15573-MMORPG UE4手游资深美术3D设计(上海)", "positionType": "设计类", "positionNumber": "1", "location": "上海", "publishTime": "2019-02-14"},{
"positionName": "15573-MMORPG UE4手游3D动画设计师(上海)", "positionType": "设计类", "positionNumber": "1", "location": "上海", "publishTime": "2019-02-14"},{
"positionName": "15573-MMORPG UE4手游3D特效美术师(上海)", "positionType": "设计类", "positionNumber": "1", "location": "上海", "publishTime": "2019-02-14"},{
"positionName": "15573-MMORPG UE4手游交互设计师(上海)", "positionType": "设计类", "positionNumber": "1", "location": "上海", "publishTime": "2019-02-14"},{
"positionName": "AQ-产品安全经理(深圳)", "positionType": "职能类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "AQ-产品安全经理(广州)", "positionType": "职能类", "positionNumber": "1", "location": "广州", "publishTime": "2019-02-14"},{
"positionName": "29050-数据安全经理/专家(深圳)", "positionType": "职能类", "positionNumber": "2", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "29050-数据安全经理/专家(北京)", "positionType": "职能类", "positionNumber": "2", "location": "北京", "publishTime": "2019-02-14"},{
"positionName": "AQ-行业合作经理(北京)", "positionType": "职能类", "positionNumber": "1", "location": "北京", "publishTime": "2019-02-14"},{
"positionName": "AQ-行业合作经理(深圳)", "positionType": "职能类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "27553-腾讯音乐人曲库运营", "positionType": "内容编辑类", "positionNumber": "2", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "22086-体育号创作平台产品经理", "positionType": "产品/项目类", "positionNumber": "1", "location": "北京", "publishTime": "2019-02-14"},{
"positionName": "22086-体育号CP管理产品经理 ", "positionType": "产品/项目类", "positionNumber": "1", "location": "北京", "publishTime": "2019-02-14"},{
"positionName": "22086-体育号内容质量产品经理", "positionType": "产品/项目类", "positionNumber": "1", "location": "北京", "publishTime": "2019-02-14"},{
"positionName": "28297-二次元手游本地化策划(深圳)", "positionType": "产品/项目类", "positionNumber": "2", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "GY0-腾讯云海外商务拓展", "positionType": "市场类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "PCG10-高级产品经理(工具产品方向)", "positionType": "产品/项目类", "positionNumber": "1", "location": "成都", "publishTime": "2019-02-14"},{
"positionName": "18432-策略分析师", "positionType": "产品/项目类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "18432-基金高级分析师", "positionType": "产品/项目类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "21309-在线教育运营专家/增长黑客", "positionType": "产品/项目类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "28481-高级医疗商务拓展经理(北京)", "positionType": "市场类", "positionNumber": "1", "location": "北京", "publishTime": "2019-02-14"},{
"positionName": "21882-高级医学编辑(深圳/北京)", "positionType": "内容编辑类", "positionNumber": "2", "location": "北京", "publishTime": "2019-02-14"},{
"positionName": "18402-MMO手游-平台渠道运营", "positionType": "产品/项目类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "28170-腾讯游戏直播业务管理经理(深圳)", "positionType": "产品/项目类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "28170-腾讯游戏直播内容品牌经理(深圳)", "positionType": "市场类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "PCG10-浏览器阅读中心后台开发工程师(深圳)", "positionType": "技术类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},{
"positionName": "PCG10-浏览器阅读中心前端开发工程师(深圳)", "positionType": "技术类", "positionNumber": "2", "location": "深圳", "publishTime": "2019-02-14"},

 四.详情

  

 

转载于:https://www.cnblogs.com/yszd/p/10380648.html

你可能感兴趣的文章
【SAP HANA】关于SAP HANA中带层次结构的计算视图Cacultation View创建、激活状况下在系统中生成对象的研究...
查看>>
[nodejs] nodejs开发个人博客(五)分配数据
查看>>
《Linux内核修炼之道》 之 高效学习Linux内核
查看>>
Java数据持久层框架 MyBatis之API学习九(SQL语句构建器详解)
查看>>
30分钟Git命令“从入门到放弃”
查看>>
nginx : TCP代理和负载均衡的stream模块
查看>>
MYSQL数据库间同步数据
查看>>
DevOps 前世今生 | mPaaS 线上直播 CodeHub #1 回顾
查看>>
iOS 解决UITabelView刷新闪动
查看>>
让前端小姐姐愉快地开发表单
查看>>
Dubbo笔记(四)
查看>>
Web前端JQuery入门实战案例
查看>>
java B2B2C Springboot电子商城系统- SSO单点登录之OAuth2.0 登出流程(3)
查看>>
12月26日云栖精选夜读:CDN新品发布:阿里云SCDN安全加速开放公测
查看>>
USB 通信原理
查看>>
7zZip zip RAR iOS
查看>>
date命令的详细用法!
查看>>
分布式存储ceph集群部署
查看>>
UiAutomator源码分析之UiAutomatorBridge框架
查看>>
python 开发之selenium
查看>>