爬取软件工程师相关信息

编程语言
991
爬取软件工程师相关信息
# 爬取有关软件工作的信息

import re

import requests
from bs4 import BeautifulSoup
from pymysql import connect

# Open the MySQL connection and the cursor that the functions below share
conn = connect(
    host="localhost",
    user="root",
    password="root",
    database="python",
    charset="utf8",
)
cursor = conn.cursor()


# Download a page and hand back its HTML text
def get_html_resources(url, headers, timeout=10):
    """Fetch ``url`` with a GET request and return the response body as text.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled server would block the crawl forever.

    Returns:
        The response text when the status code is 200; otherwise ``None``,
        after an error message has been printed.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` on
            connection errors or when the timeout expires.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.text
    print("获取网站源码出错........")
    return None
# Parse a search-result page: extract each posting, fetch its detail page,
# store it in the database and yield it to the caller.

# Captures the text between 'engine_search_result":' and the closing script tag.
_SEARCH_RESULT_RE = re.compile('engine_search_result":(.*?)</script>', re.S)
# Captures (job_href, job_name, company_name, providesalary_text, attribute_text)
# for every posting inside the search-result text.
_JOB_ENTRY_RE = re.compile(
    'job_href":"(.*?)","job_name":"(.*?)".*?company_name":"(.*?)","providesalary_text":"(.*?)".*?attribute_text":(.*?),"companysize_text',
    re.S)
# Seconds to wait for a detail page before giving up on that posting.
_DETAIL_TIMEOUT = 10


def _split_attributes(raw_attributes):
    """Return ``(address, education)`` taken from the raw attribute-list text."""
    parts = str(raw_attributes).replace('["', "").replace('"]', "").replace('"', "").split(",")
    # NOTE(review): the positions below are kept from the original code; which
    # slot holds the education level is assumed, not verified against the site.
    if len(parts) == 4:
        return parts[0], parts[2]
    if len(parts) == 3:
        return parts[0], parts[1]
    # Any other shape: use empty values instead of failing the whole page.
    return "", ""


def _fetch_job_requirements(job_url, headers):
    """Download a posting's detail page and return its requirement text ("" on failure)."""
    try:
        response = requests.get(job_url, headers=headers, timeout=_DETAIL_TIMEOUT)
    except requests.RequestException as exc:
        print("获取详情页的信息错误", exc)
        return ""
    if response.status_code != 200:
        print("获取详情页的信息错误")
        return ""
    response.encoding = "gbk"
    soup = BeautifulSoup(response.text, "lxml")
    job_msg = soup.find("div", class_="bmsg job_msg inbox")
    # find() returns None when the page has no such element.
    return job_msg.text if job_msg is not None else ""


def _insert_job(table, cursor, values):
    """Insert one posting into ``table``; roll back and report if the insert fails."""
    # A table name cannot be bound as a parameter, so only the scraped values
    # are; this keeps quotes in the scraped text from breaking the statement.
    sql = ("insert into " + str(table)
           + "(job_name,company_name,salary,job_address,education,job_require)"
           + " values (%s,%s,%s,%s,%s,%s);")
    try:
        cursor.execute(sql, values)
        conn.commit()
    except Exception as exc:
        # Best-effort insert: report the cause and continue with the next posting.
        print("数据插入异常............", exc)
        conn.rollback()


def parse_detail_page(html, headers, table, cursor):
    """Extract the job postings from a search-result page, store and yield them.

    For every posting found in ``html`` the detail page is downloaded, a row
    is inserted into ``table`` and a dict describing the posting is yielded.
    The row is written before the dict is yielded, so a consumer that stops
    iterating early does not lose the row for the last posting it received.

    Args:
        html: Source of the search-result page; ``None`` or an empty string
            produces no postings.
        headers: HTTP headers used when requesting each detail page.
        table: Name of the database table that receives the rows.
        cursor: Database cursor used to execute the inserts.

    Yields:
        A dict with the keys "工作名称", "公司名称", "工作待遇", "工作地点",
        "学历要求" and "工作要求". Fields that could not be determined are
        empty strings.

    Returns:
        The list of search-result matches (as the generator's return value).
    """
    if not html:
        return []
    items = _SEARCH_RESULT_RE.findall(html)
    if not items:
        return items

    for item in _JOB_ENTRY_RE.findall(items[0]):
        # Strip the backslashes that escape the URL inside the page source.
        job_url = str(item[0]).replace("\\", "")
        address, education = _split_attributes(item[4])
        job_require = _fetch_job_requirements(job_url, headers)

        _insert_job(table, cursor, (item[1], item[2], item[3], address, education, job_require))

        yield {
            "工作名称": item[1],
            "公司名称": item[2],
            "工作待遇": item[3],
            "工作地点": address,
            "学历要求": education,
            "工作要求": job_require,
        }

    return items


def main():
    """Create one table per job category and crawl two result pages for each.

    For every (table, search keyword) pair the table is created, then result
    pages 1 and 2 are downloaded, parsed and printed posting by posting. A
    failure on one table or page is reported and the crawl carries on.
    Only ``Exception`` is caught, so Ctrl+C still stops the program.
    """
    # Request headers sent with every HTTP request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
    }

    # Table names and the search keyword crawled into each table (same order)
    table_list = ["android", "nature_language", "deep_learning", "computer_vision", "big_data", "machine_learning",
                  "production_manager", "education", "finance", "service", "transportation"]
    job_name_list = ["android开发", "自然语言处理", "深度学习", "计算机视觉", "大数据", "机器学习", "产品经理", "教师", "金融", "服务类", "运输"]

    for table, job in zip(table_list, job_name_list):
        # Create the table; a failure is reported and the crawl still proceeds.
        try:
            sql = "create table " + str(
                table) + "(job_name varchar(200),company_name varchar(300),salary varchar(300),job_address varchar(500),education varchar(100),job_require varchar(5000),min_salary int(11) null,max_salary int(11));"
            cursor.execute(sql)
            conn.commit()
        except Exception as exc:
            # The message says the table already exists; the exception is
            # printed as well so that any other cause is visible.
            print("数据表已存在正......................", exc)

        # Crawl result pages 1 and 2 for this keyword
        for page in (1, 2):
            url = ("https://search.51job.com/list/000000,000000,0000,00,9,99,{},2,{}"
                   ".html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
                   "&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
                   ).format(job, page)
            try:
                html = get_html_resources(url, headers)
                for item in parse_detail_page(html, headers, table, cursor):
                    print(item)
            except Exception as exc:
                print(url)
                print("获取异常", exc)


# Start the crawl only when this file is executed as a script, not when imported
if __name__ == "__main__":
    main()
正经战队
Exia，驱逐目标！
发布数
关注者
1832
累计阅读
爬取软件工程师相关信息 4年前

热门教程文档