爬虫案例(九)

1.创建线程的第一种方式

import random
import  threading
import time


def down(file):
    """Simulate downloading ``file`` on the calling thread.

    Prints a first progress line, sleeps for a random whole number of
    seconds between 0 and 3 (inclusive), then prints a second progress line.
    """
    print(file, '任务1开始下载')
    pause_seconds = random.randint(0, 3)
    time.sleep(pause_seconds)
    print(file, '任务2开始下载')


if __name__ == '__main__':
    # Spawn three threads; each one runs ``down`` with its own task number.
    for task_id in range(3):
        worker = threading.Thread(target=down, args=(task_id,))
        worker.start()

2.创建线程的第二种方式
import random
import threading
import time

class Mythread(threading.Thread):
    """Thread subclass that simulates a download in four steps."""

    def run(self):
        """Sleep a random 0-3 seconds, then print a progress line, four times."""
        for _ in range(1, 5):
            time.sleep(random.randint(0, 3))
            print('下载中~~~')
if __name__ == '__main__':
    # Create the worker thread and start it; ``start`` invokes ``run``
    # on the new thread.
    t = Mythread()
    t.start()

3.传参方式

import  threading

class Mythread(threading.Thread):
    """Thread that reports the file name it was constructed with."""

    def __init__(self, fliename):
        """Initialise the base ``Thread`` and remember ``fliename``.

        The parameter and attribute keep their existing spelling
        (``fliename``) because callers may pass or read them by name.
        """
        # The base initialiser has to run before the thread can be started.
        super().__init__()
        self.fliename = fliename

    def run(self):
        """Print the stored file name followed by the download message."""
        target = self.fliename
        print(target, '正在下载')

if __name__ == '__main__':
    # Build the thread with the file name to report, then start it.
    Mythread('pachong.py').start()

4.查看线程的名称

import threading

class My_Thread(threading.Thread):
    """Thread whose ``run`` reports which thread is executing it."""

    # Override of Thread.run.
    def run(self):
        """Print the current thread object followed by the running message.

        The whole object returned by ``threading.current_thread()`` is
        formatted, not just its name. The ``name`` attribute holds the
        thread name alone (``Thread-n`` by default, where n is a number):
        print('{}正在运行'.format(self.name))
        """
        running_thread = threading.current_thread()
        print('{}正在运行'.format(running_thread))

if __name__ == '__main__':
    # Start three threads; the ``name`` keyword replaces the default
    # thread name with download0, download1, download2.
    for index in range(3):
        worker = My_Thread(name='download{}'.format(index))
        worker.start()

5.线程之间共享全局变量

import threading
# A lock that has been acquired must always be released again.

# Shared counter, starting at zero, that the worker threads increment.
value = 0

# Lock used to serialise updates to the shared counter.
lock = threading.Lock()

def add_value(times=1000000):
    """Increment the shared ``value`` counter ``times`` times and print it.

    The whole loop runs while holding the module-level ``lock``. Without the
    lock, 100 iterations appear fine but 1000000 iterations from two threads
    produce a wrong total.

    Args:
        times: Number of increments to perform. Defaults to 1000000.
    """
    global value
    # ``with`` releases the lock even if the body raises; a lock that is
    # never released would deadlock every other thread waiting on it.
    with lock:
        for _ in range(times):
            value += 1
        # Read the total while still holding the lock so the printed number
        # is this thread's result, not a value another thread is changing.
        snapshot = value
    print('value的值是', snapshot)
if __name__ == '__main__':
    # Run ``add_value`` on two threads that share the same counter.
    for _ in range(2):
        worker = threading.Thread(target=add_value)
        worker.start()

6.生产者和消费者模式

import threading,random


import time
# Lock shared by all producer and consumer threads.
lock = threading.Lock()

# Bank balance shared by all threads.
g_money = 1000

# Number of times money has been produced so far.
g_times = 0

# Producer thread class.
class Product(threading.Thread):
    """Producer thread that deposits random amounts into ``g_money``."""

    def run(self):
        """Deposit money about once per second until the quota is used up.

        Each round, while holding the module-level ``lock``, adds a random
        amount between 100 and 1000 to ``g_money`` and increments
        ``g_times``. The loop ends once ``g_times`` is greater than 10.
        """
        global g_money
        global g_times
        # Loop forever so production continues until the quota check breaks.
        while True:
            # ``with`` releases the lock on every exit path, including the
            # ``break`` below and any exception raised inside the block.
            with lock:
                if g_times > 10:
                    break
                money = random.randint(100, 1000)
                g_money += money
                print('{}生产了{}钱'.format(threading.current_thread(), money))
                g_times += 1
            # Sleep outside the lock so other threads can run meanwhile.
            time.sleep(1)

# Consumer thread class.
class Comsumer(threading.Thread):
    """Consumer thread that withdraws random amounts from ``g_money``."""

    def run(self):
        """Try to spend money about once per second.

        Each round, while holding the module-level ``lock``, picks a random
        amount between 100 and 1000. If the balance covers it, the amount is
        subtracted from ``g_money``. Otherwise the loop ends when ``g_times``
        is greater than 10, and reports insufficient funds if it is not.
        """
        global g_money
        # Loop forever so consumption continues until the exit check breaks.
        while True:
            # ``with`` releases the lock on every exit path, including the
            # ``break`` below and any exception raised inside the block.
            with lock:
                money = random.randint(100, 1000)
                if g_money >= money:
                    g_money -= money
                    print('{}消费了{}钱,还剩{}钱'.format(threading.current_thread(),money,g_money))
                elif g_times > 10:
                    # Balance too low and production has ended: stop.
                    break
                else:
                    print('{}准备消费{}钱,余额不足'.format(threading.current_thread(),money))
            # Sleep outside the lock so other threads can run meanwhile.
            time.sleep(1)

if __name__ == '__main__':
    # Start three producer threads.
    for _ in range(3):
        producer = Product()
        producer.start()

    # Start five consumer threads.
    for _ in range(5):
        consumer = Comsumer()
        consumer.start()

7.腾讯招聘

import requests

from selenium import webdriver
from lxml import etree
# Launch Chrome through the local chromedriver binary. The Windows path needs
# its backslashes, so it is written as a raw string.
# NOTE(review): ``executable_path`` is the Selenium 3 style argument; newer
# Selenium releases expect a ``Service`` object instead - confirm the
# installed version.
driver = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Downloads\chromedriver.exe')


def _first_text(node, path):
    """Return the first XPath match of ``path`` under ``node``, or '' if none."""
    matches = node.xpath(path)
    return matches[0] if matches else ''


try:
    # Maximise the browser window.
    driver.maximize_window()
    # URL analysis: the JSON API behind the search page differs per page in
    # its timestamp and pageIndex parameters.
    # Page 1  https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1604126874662&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex=1&pageSize=10&language=zh-cn&area=cn
    # Page 2  https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1604126807045&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex=2&pageSize=10&language=zh-cn&area=cn
    # Page 3  https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1604126714145&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex=3&pageSize=10&language=zh-cn&area=cn
    for i in range(1, 51):
        # Load search result page ``i`` in the browser.
        driver.get(url='https://careers.tencent.com/search.html?index={}'.format(i))

        # Parse the HTML currently held by the browser.
        html = etree.HTML(driver.page_source)
        # Collect one <div> per job posting.
        div_list = html.xpath(r'//div[@class="recruit-wrap recruit-margin"]/div[@class="recruit-list"]')

        for data in div_list:
            # A posting that lacks a field yields '' instead of IndexError.
            # Job title.
            title = _first_text(data, r'./a[@class="recruit-list-link"]/h4/text()')
            # Location.
            address = _first_text(data, r'./a[@class="recruit-list-link"]/p/span[2]/text()')
            # Posting time; not named ``time`` so it cannot shadow the
            # stdlib module of that name.
            publish_time = _first_text(data, r'./a[@class="recruit-list-link"]/p/span[4]/text()')

            print(title, address, publish_time)
finally:
    # Always shut the browser down, even when a page fails to load or parse.
    driver.quit()
匿名

发表评论

匿名网友