import scrapy
from scrapy.spiders.crawl import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from ..items import RentItem


class RentspiderSpider(CrawlSpider):
    """Crawl the hz.58.com ``caihongcheng/chuzu`` listing pages.

    Pagination links are followed; every ``/zufang/`` or ``/hezu/`` detail
    link that is found is handed to :meth:`parse_zufang`, which yields one
    ``RentItem`` per detail page.
    """

    name = 'rentspider'
    allowed_domains = ['hz.58.com']
    start_urls = ['https://hz.58.com/caihongcheng/chuzu/pn1']

    # All patterns are raw strings so that ``\d`` reaches the regex engine
    # without tripping Python's invalid-escape warning, and literal dots are
    # escaped so they no longer match arbitrary characters.
    #
    # Example detail URLs the last two rules are meant to match:
    #   https://hz.58.com/hezu/50659210723596x.shtml?houseId=2562337288563716&shangquan=caihongcheng&...
    #   https://hz.58.com/zufang/50541694381583x.shtml?houseId=2547295196438538&shangquan=caihongcheng&...
    #   https://hz.58.com/zufang/49422523381159x.shtml?houseId=2404041308351491&shangquan=caihongcheng&...
    rules = (
        # Listing pagination: keep following, nothing to scrape here.
        Rule(
            LinkExtractor(allow=r'https://hz\.58\.com/caihongcheng/chuzu/pn\d+.*'),
            follow=True,
        ),
        # Detail pages: scrape, do not follow further.
        Rule(
            LinkExtractor(allow=r'/zufang/\d+x\.shtml\?houseId=\d+.*'),
            follow=False,
            callback='parse_zufang',
        ),
        Rule(
            LinkExtractor(allow=r'/hezu/\d+x\.shtml\?houseId=\d+.*'),
            follow=False,
            callback='parse_zufang',
        ),
    )

    def parse_hezu(self, response):
        """Dump the raw response body to stdout.

        Debugging helper; no active rule uses it as a callback.
        """
        print('*' * 200)
        print(response.text)

    def parse_zufang(self, response):
        """Extract one ``RentItem`` from a rental detail page.

        Missing nodes no longer raise: fields that the original code
        stripped fall back to an empty string, and a short or absent
        layout line yields empty ``house_style`` / ``house_area`` /
        ``house_deco`` values instead of ``AttributeError``/``IndexError``.

        Args:
            response: the Scrapy response for a detail page.

        Yields:
            RentItem: the scraped listing.
        """
        self.logger.debug('Parsing rental detail page %s', response.url)

        basic = '.house-basic-info .house-basic-right'
        desc_item = basic + ' .house-basic-desc .house-desc-item'

        # title and rent_style are passed through untouched (may be None),
        # exactly as before.
        title = response.css('.house-title h1::text').get()
        price = response.css(basic + ' .strongbox::text').get(default='').strip()
        rent_style = response.css(
            desc_item + ' li span:nth-child(2)::text').get()

        # The second <li> holds one whitespace-separated string; token 0 is
        # stored as house_style, tokens 1+2 as house_area and token 3 as
        # house_deco (presumably layout / area value+unit / decoration --
        # verify against a live page).
        layout_text = response.css(
            desc_item + ' li:nth-child(2) span:nth-child(2)::text').get(default='')
        tokens = layout_text.split()
        tokens += [''] * (4 - len(tokens))  # pad so unpacking never fails
        house_style, area_value, area_unit, house_deco = tokens[:4]
        house_area = area_value + area_unit

        house_location = response.css(
            desc_item + ' li:nth-child(4) span:nth-child(2) a::text'
        ).get(default='').strip()
        address = response.css(
            desc_item + ' li:nth-child(6) span:nth-child(2)::text'
        ).get(default='').strip()

        desc = response.css(
            '.house-detail-desc .main-detail-info .house-word-introduce '
            '.introduce-item li .a2::text').getall()
        description = ''.join(desc).strip()

        # NOTE(review): takes the href of the first <link> in <head>;
        # presumably the canonical URL -- confirm against a live page.
        url = response.xpath('/html/head/link[1]/@href').get()

        yield RentItem(
            title=title,
            price=price,
            rent_style=rent_style,
            house_style=house_style,
            house_deco=house_deco,
            house_area=house_area,
            house_location=house_location,
            address=address,
            description=description,
            url=url,
        )
import scrapy
from scrapy.spiders.crawl import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from ..items import RentItem


class RentspiderSpider(CrawlSpider):
    """Crawl the hz.58.com ``caihongcheng/chuzu`` listing pages.

    Pagination links are followed; every ``/zufang/`` or ``/hezu/`` detail
    link that is found is handed to :meth:`parse_zufang`, which yields one
    ``RentItem`` per detail page.
    """

    name = 'rentspider'
    allowed_domains = ['hz.58.com']
    start_urls = ['https://hz.58.com/caihongcheng/chuzu/pn1']

    # All patterns are raw strings written as a single token (``r'...'``;
    # a space between the prefix and the quote is a syntax error), and
    # literal dots are escaped so they no longer match arbitrary characters.
    #
    # Example detail URLs the last two rules are meant to match:
    #   https://hz.58.com/hezu/50659210723596x.shtml?houseId=2562337288563716&shangquan=caihongcheng&...
    #   https://hz.58.com/zufang/50541694381583x.shtml?houseId=2547295196438538&shangquan=caihongcheng&...
    #   https://hz.58.com/zufang/49422523381159x.shtml?houseId=2404041308351491&shangquan=caihongcheng&...
    rules = (
        # Listing pagination: keep following, nothing to scrape here.
        Rule(
            LinkExtractor(allow=r'https://hz\.58\.com/caihongcheng/chuzu/pn\d+.*'),
            follow=True,
        ),
        # Detail pages: scrape, do not follow further.
        Rule(
            LinkExtractor(allow=r'/zufang/\d+x\.shtml\?houseId=\d+.*'),
            follow=False,
            callback='parse_zufang',
        ),
        Rule(
            LinkExtractor(allow=r'/hezu/\d+x\.shtml\?houseId=\d+.*'),
            follow=False,
            callback='parse_zufang',
        ),
    )

    def parse_hezu(self, response):
        """Dump the raw response body to stdout.

        Debugging helper; no active rule uses it as a callback.
        """
        print('*' * 200)
        print(response.text)

    def parse_zufang(self, response):
        """Extract one ``RentItem`` from a rental detail page.

        Missing nodes no longer raise: fields that the original code
        stripped fall back to an empty string, and a short or absent
        layout line yields empty ``house_style`` / ``house_area`` /
        ``house_deco`` values instead of ``AttributeError``/``IndexError``.

        Args:
            response: the Scrapy response for a detail page.

        Yields:
            RentItem: the scraped listing.
        """
        self.logger.debug('Parsing rental detail page %s', response.url)

        basic = '.house-basic-info .house-basic-right'
        desc_item = basic + ' .house-basic-desc .house-desc-item'

        # title and rent_style are passed through untouched (may be None),
        # exactly as before.
        title = response.css('.house-title h1::text').get()
        price = response.css(basic + ' .strongbox::text').get(default='').strip()
        rent_style = response.css(
            desc_item + ' li span:nth-child(2)::text').get()

        # The second <li> holds one whitespace-separated string; token 0 is
        # stored as house_style, tokens 1+2 as house_area and token 3 as
        # house_deco (presumably layout / area value+unit / decoration --
        # verify against a live page).
        layout_text = response.css(
            desc_item + ' li:nth-child(2) span:nth-child(2)::text').get(default='')
        tokens = layout_text.split()
        tokens += [''] * (4 - len(tokens))  # pad so unpacking never fails
        house_style, area_value, area_unit, house_deco = tokens[:4]
        house_area = area_value + area_unit

        house_location = response.css(
            desc_item + ' li:nth-child(4) span:nth-child(2) a::text'
        ).get(default='').strip()
        address = response.css(
            desc_item + ' li:nth-child(6) span:nth-child(2)::text'
        ).get(default='').strip()

        desc = response.css(
            '.house-detail-desc .main-detail-info .house-word-introduce '
            '.introduce-item li .a2::text').getall()
        description = ''.join(desc).strip()

        # NOTE(review): takes the href of the first <link> in <head>;
        # presumably the canonical URL -- confirm against a live page.
        url = response.xpath('/html/head/link[1]/@href').get()

        yield RentItem(
            title=title,
            price=price,
            rent_style=rent_style,
            house_style=house_style,
            house_deco=house_deco,
            house_area=house_area,
            house_location=house_location,
            address=address,
            description=description,
            url=url,
        )
# Tags: style, python, house, hezu, basic, response, desc
# From: https://www.cnblogs.com/Jeffreywuyu/p/16887588.html