编程语言
991
# Scrape job-posting information for software-related positions
import re

import requests
from bs4 import BeautifulSoup
from pymysql import MySQLError, connect

# Seconds to wait for any single HTTP response before giving up, so that one
# stalled connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10

# Extracts the JSON-like search-result payload embedded in the listing page.
SEARCH_RESULT_PATTERN = re.compile('engine_search_result":(.*?)</script>', re.S)

# Extracts (job_href, job_name, company_name, salary text, attribute list)
# for every posting inside the search-result payload.
JOB_DETAIL_PATTERN = re.compile(
    'job_href":"(.*?)","job_name":"(.*?)".*?company_name":"(.*?)",'
    '"providesalary_text":"(.*?)".*?attribute_text":(.*?),"companysize_text',
    re.S,
)

# Module-level database connection and cursor shared by the functions below.
conn = connect(user="root", password="root", host="localhost", database="python", charset="utf8")
cursor = conn.cursor()


def get_html_resources(url, headers):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers sent with the request.

    Returns:
        The response text when the server answers with status 200,
        otherwise ``None`` (after printing a diagnostic message).

    Raises:
        requests.RequestException: On network failures or timeouts.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    if response.status_code == 200:
        return response.text
    print("获取网站源码出错........")
    return None


def _split_location_and_education(attribute_text):
    """Return ``(address, education)`` taken from a raw attribute-list string.

    The raw text looks like a JSON array of strings. With four entries the
    education is the third one; with three entries it is the second one.
    Any other length yields empty strings for both values.
    """
    fields = (
        str(attribute_text)
        .replace('["', "")
        .replace('"]', "")
        .replace('"', "")
        .split(",")
    )
    if len(fields) == 4:
        return fields[0], fields[2]
    if len(fields) == 3:
        return fields[0], fields[1]
    return "", ""


def _fetch_job_requirements(job_url, headers):
    """Return the requirement text of one job detail page, or ``""`` on failure."""
    try:
        response = requests.get(job_url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        print("获取详情页的信息错误")
        return ""
    if response.status_code != 200:
        print("获取详情页的信息错误")
        return ""
    # The detail page is decoded as GBK, as in the original script.
    response.encoding = "gbk"
    soup = BeautifulSoup(response.text, "lxml")
    node = soup.find("div", class_="bmsg job_msg inbox")
    # The block may be missing (e.g. a differently laid-out page).
    return node.text if node is not None else ""


def _insert_job(cursor, table, row):
    """Insert one six-column job *row* into *table*, rolling back on failure.

    Values are passed as query parameters so scraped text containing quotes
    cannot break or inject into the statement. The table name cannot be a
    parameter and is interpolated directly; it must come from trusted code.
    """
    sql = (
        "insert into " + str(table)
        + "(job_name,company_name,salary,job_address,education,job_require)"
        " values (%s,%s,%s,%s,%s,%s);"
    )
    try:
        cursor.execute(sql, row)
        conn.commit()
    except MySQLError:
        print("数据插入异常............")
        conn.rollback()


def parse_detail_page(html, headers, table, cursor):
    """Parse a search-result page, store every job found, and yield it.

    For each posting the job's detail page is downloaded to obtain the
    requirement text, the record is inserted into *table*, and then the
    record is yielded. The insert happens before the yield so the row is
    stored even if the consumer stops iterating early.

    Args:
        html: Source of the search-result page; ``None`` or empty yields nothing.
        headers: HTTP headers used for the detail-page requests.
        table: Name of the destination table (trusted, not user input).
        cursor: Database cursor used for the inserts; commits and rollbacks
            go through the module-level ``conn``.

    Yields:
        A dict with the job name, company name, salary, address, education
        and requirement text. Fields that could not be determined are ``""``.
    """
    if not html:
        return []
    items = SEARCH_RESULT_PATTERN.findall(html)
    if not items:
        return items

    for item in JOB_DETAIL_PATTERN.findall(items[0]):
        # The href is JSON-escaped ("\/"); strip the backslashes.
        job_url = str(item[0]).replace("\\", "")
        address, education = _split_location_and_education(item[4])
        content = _fetch_job_requirements(job_url, headers)

        _insert_job(cursor, table, (item[1], item[2], item[3], address, education, content))

        yield {
            "工作名称": item[1],
            "公司名称": item[2],
            "工作待遇": item[3],
            "工作地点": address,
            "学历要求": education,
            "工作要求": content,
        }
    # Kept from the original: the raw search-result matches become the
    # generator's return value.
    return items


def main():
    """Create one table per job category and scrape two result pages for each."""
    # Browser-like headers for every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
    }
    # Destination tables and the search keyword that feeds each of them
    # (paired by position).
    table_list = ["android", "nature_language", "deep_learning", "computer_vision", "big_data", "machine_learning",
                  "production_manager", "education", "finance", "service", "transportation"]
    job_name_list = ["android开发", "自然语言处理", "深度学习", "计算机视觉", "大数据", "机器学习", "产品经理", "教师", "金融", "服务类", "运输"]

    for table, job in zip(table_list, job_name_list):
        # Create the table; a failure (typically "table already exists") is
        # reported and scraping proceeds regardless.
        try:
            sql = "create table " + str(
                table) + "(job_name varchar(200),company_name varchar(300),salary varchar(300),job_address varchar(500),education varchar(100),job_require varchar(5000),min_salary int(11) null,max_salary int(11));"
            cursor.execute(sql)
            conn.commit()
        except MySQLError:
            print("数据表已存在正......................")

        # Fetch result pages 1 and 2 for this keyword.
        for page in range(1, 3):
            url = "https://search.51job.com/list/000000,000000,0000,00,9,99," + str(job) + ",2," + str(
                page) + ".html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
            try:
                html = get_html_resources(url, headers)
                for item in parse_detail_page(html, headers, table, cursor):
                    print(item)
            except Exception:
                # Top-level boundary: report the failing page and move on.
                print(url)
                print("获取异常")


if __name__ == "__main__":
    try:
        main()
    finally:
        # Release database resources however the crawl ended.
        cursor.close()
        conn.close()