# Scraping data with Python (python爬取数据)

from bs4 import BeautifulSoup

import requests

import time


# Saved-items ("Saves") page of a TripAdvisor account.
url_saves = 'http://www.tripadvisor.com/Saves#37685322'

# First page of the New York City attraction listing.
url = 'https://cn.tripadvisor.com/Attractions-g60763-Activities-New_York_City_New_York.html'

# Follow-up listing pages; the "oa" offset in the URL advances in steps of 30 (30, 60, ..., 900).
urls = [
    'https://cn.tripadvisor.com/Attractions-g60763-Activities-oa{}-New_York_City_New_York.html#ATTRACTION_LIST'.format(offset)
    for offset in range(30, 930, 30)
]


# Simulate a logged-in session: request headers sent with authenticated requests.
# Both values are empty placeholders in this file.
headers = {'User-Agent': '', 'Cookie': ''}



def get_attractions(url, data=None):
    """Fetch one attraction-listing page and print a record for each attraction.

    Args:
        url: Address of the listing page to download.
        data: When left as ``None`` (the default) the scraped records are
            printed; when any other value is passed the page is still
            downloaded and parsed but nothing is printed.

    Returns:
        None. Each record is written to stdout as a dict with the keys
        ``'title'``, ``'img'`` and ``'cate'``.
    """
    wb_data = requests.get(url)

    # Pause after every request so the IP is not rate-limited for crawling too fast.
    time.sleep(4)

    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select('div.property_title > a[target="_blank"]')
    imgs = soup.select('img[width="160"]')
    cates = soup.select('div.p13n_reasoning_v2')

    if data is not None:
        return

    # zip() stops at the shortest list, so unmatched trailing elements are dropped.
    for title, img, cate in zip(titles, imgs, cates):
        record = {
            'title': title.get_text(),
            'img': img.get('src'),
            'cate': list(cate.stripped_strings),
        }
        # Print inside the loop so every attraction is reported, not only the
        # last one on the page (and nothing is printed for an empty page).
        print(record)



def get_favs(url, data=None):
    """Fetch a saved-items page with the module-level ``headers`` and print each entry.

    Args:
        url: Address of the page to download.
        data: When left as ``None`` (the default) the scraped records are
            printed; when any other value is passed the page is still
            downloaded and parsed but nothing is printed.

    Returns:
        None. Each record is written to stdout as a dict with the keys
        ``'title'``, ``'img'`` and ``'meta'``.
    """
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select('a.location-name')
    imgs = soup.select('div.photo > div.sizedThumb > img.photo_image')
    metas = soup.select('span.format_address')

    if data is not None:
        return

    # zip() stops at the shortest list, so unmatched trailing elements are dropped.
    for title, img, meta in zip(titles, imgs, metas):
        # Use a separate local name so the ``data`` parameter is not overwritten.
        record = {
            'title': title.get_text(),
            'img': img.get('src'),
            'meta': list(meta.stripped_strings),
        }
        print(record)


# Crawl every paginated listing page in turn. This runs at module level, so it
# starts as soon as the file is executed or imported; each call also sleeps
# for 4 seconds inside get_attractions.
for single_url in urls:

    get_attractions(single_url)



# The site guards its images against scraping (the image links are actually set
# by JavaScript), so the page is opened by simulating a mobile device instead,
# because the mobile site's anti-scraping measures are less strict.
# The sample below is kept inside a bare string literal, so it is not executed.

'''

headers = {

    'User-Agent':'', #mobile device user agent from chrome

}



mb_data = requests.get(url,headers=headers)

soup = BeautifulSoup(mb_data.text,'lxml')

imgs = soup.select('div.thumb.thumbLLR.soThumb > img')

for i in imgs:

    print(i.get('src'))

'''


# End of "Scraping data with Python" (python爬取数据). Source: www.h5w3.com/116143.html

# Back to top