1.kills.sh
```sh
#!/bin/sh
NAME=$1  # $1: runtime argument, the target process/file name
if [ -z "$NAME" ]; then
    echo "STRING is empty"
    NAME="aa"
fi
echo "$NAME"

# Collect the PIDs of every matching process, excluding this script and grep itself
ID=$(ps -ef | grep "$NAME" | grep -v "$0" | grep -v "grep" | awk '{print $2}')
echo "$ID"
echo "---------------"
for id in $ID
do
    kill -9 "$id"
    echo "killed $id"
done
echo "---------------"
```
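The grep pipeline above will also hit processes whose command lines merely contain the name. As a hedged alternative, here is a minimal sketch of the same cleanup with pgrep/pkill (assumes the procps tools are installed, which is the case on most Linux distributions; this is not the author's script):

```sh
#!/bin/sh
# Minimal sketch: match against the full command line with -f.
NAME=${1:-aa}
pgrep -f "$NAME"      # print the PIDs that would be hit, to sanity-check the match
pkill -9 -f "$NAME"   # SIGKILL every process whose command line contains $NAME
```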
2.run_py.sh
```sh
#!/bin/sh
NAME=$1           # $1: runtime argument, the Python file to (re)start
NAME=${NAME%%.*}  # strip the extension: foo.py -> foo
if [ -z "$NAME" ]; then
    echo "STRING is empty"
    NAME="aa"
fi
echo "$NAME"

# Kill any previous instance of the script
ID=$(ps -ef | grep "$NAME" | grep -v "$0" | grep -v "grep" | awk '{print $2}')
echo "$ID"
echo "---------------"
for id in $ID
do
    kill -9 "$id"
    echo "killed $id"
done
echo "---------------"
sleep 1

# Make sure the logs directory exists next to this script
current_dir=$(cd "$(dirname "$0")"; pwd)
echo "$current_dir"
if [ ! -d "$current_dir/logs" ]; then
    echo "$current_dir/logs does not exist"
    mkdir "$current_dir/logs"
fi
echo "---------------"
echo "nohup /usr/bin/python3.6 $current_dir/$NAME.py > $current_dir/logs/$NAME.log 2>&1 &"
echo "---------------"
echo "tail -f $current_dir/logs/$NAME.log"

# Run the command directly; the original wrapped it in backticks (command
# substitution), which runs it in a subshell and discards the backgrounding.
nohup /usr/bin/python3.6 "$current_dir/$NAME.py" > "$current_dir/logs/$NAME.log" 2>&1 &
echo "started successfully"
```
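The script takes a single argument, the Python file to restart; since the extension is stripped, `foo` and `foo.py` are equivalent. A usage example (the file name here is hypothetical):

```sh
./run_py.sh national_policy.py
# then follow the log file the script prints:
tail -f ./logs/national_policy.log
```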
3.run_zc.py
```python
# -*- coding: UTF-8 -*-
import logging
import os
import platform
import subprocess
import time

logging.basicConfig(
    level=logging.INFO,  # records at this level and above go to the file
    format='%(asctime)s %(filename)s %(levelname)s : %(message)s',  # log line format
    datefmt='%Y-%m-%d %H:%M:%S',  # timestamp format
    # log file name ('.log' suffix added; the original wrote to an extension-less file)
    filename=os.path.splitext(os.path.basename(__file__))[0] + '.log',
    filemode='a',  # write mode: 'w' (truncate) or 'a' (append)
)
# Mirror log records to the console as well
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s %(filename)s %(levelname)s : %(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

logger = logging.getLogger(__name__)  # the original used `logger` without defining it


def run_main():
    system = platform.system()
    path = os.path.abspath('.')
    if 'indo' in system:  # platform.system() returns 'Windows' on Windows
        spider_path = '{}\\yational_policy.py'.format(path)
        res = subprocess.call("python {}".format(spider_path), shell=True)
    else:
        spider_path = '{}/yational_policy.py'.format(path)
        res = subprocess.call("python3.6 {}".format(spider_path), shell=True)
    logger.info(spider_path)
    logger.info(res)


if __name__ == '__main__':
    # re-run the spider every 60 seconds, effectively forever
    for i in range(100000000000):
        run_main()
        time.sleep(60)

# nohup /usr/bin/python3.6 /home/pachong/yoyo/work/national_policy/national_policy/run_zc.py > /home/pachong/yoyo/work/national_policy/logs/run_zc.log 2>&1 &
```
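run_main is re-invoked unconditionally every 60 seconds, whether the spider exited cleanly or crashed. Below is a minimal sketch of a variant that watches the child's return code and backs off while it keeps failing; the `supervise` helper and the delay values are assumptions for illustration, not part of the original script:

```python
import subprocess
import time


def supervise(cmd, base_delay=60, max_delay=600):
    """Re-run cmd forever, doubling the wait while it keeps failing. (Sketch.)"""
    delay = base_delay
    while True:
        res = subprocess.call(cmd, shell=True)
        if res == 0:
            delay = base_delay                 # clean exit: back to the normal cadence
        else:
            delay = min(delay * 2, max_delay)  # non-zero exit: wait longer each time
        time.sleep(delay)


# supervise("python3.6 yational_policy.py")  # example invocation
```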
4.Simple multi-threaded crawler
```python
import concurrent.futures
import threading

import pymysql
import requests

from yscredit_tools.MySQL import insert_update_data_mysql, select_data_mysql, insert_data_mysql
from yscredit_tools.utils import clear_dict

headers = {
    'User-Agent': 'Apifox/1.0.0 (https://www.apifox.cn)',
    'Accept': '*/*',
    'Host': 'sqzc.gd.gov.cn',
    'Connection': 'keep-alive'
}

db = pymysql.connect(host="10.1.3.29", port=3306, database="crawler_data_prd",
                     user="root", password="root", charset='utf8', autocommit=True)
cursor = db.cursor()
lock = threading.RLock()


def get_data(i):
    print(i)
    url = "https://sqzc.gd.gov.cn/sqzc/m/cms/policy/getPolicyListPage2?pageNumber={}&pageSize=10&keywords=&publisher=&city=".format(i)
    response = requests.get(url, headers=headers)
    html = response.json()
    # Serialize the writes: the DB helpers go through one shared connection,
    # which is not thread-safe. try/finally guarantees the lock is released
    # even if an insert raises.
    lock.acquire()
    try:
        for data in html["data"]:
            item = {}
            item["title"] = data["title"]
            item["publishDate"] = data["publishDate"]
            item["publisher"] = data["publisher"]
            item["city"] = data["city"]
            item["viewCount"] = data["viewCount"]
            item["place"] = data["place"]
            item["page"] = i
            # Dedup variant, kept from the original as a comment:
            # cur = select_data_mysql(dbhost="localhost", dbname="crawler_data_prd",
            #                         tablename="policy",
            #                         where='title = "{}"'.format(item["title"]))
            # if not cur.rowcount:
            #     insert_update_data_mysql(dbhost="localhost", dbname="crawler_data_prd",
            #                              tablename="policy", **clear_dict(item))
            insert_data_mysql(dbhost="localhost", dbname="crawler_data_prd",
                              tablename="policy", **clear_dict(item))
            print(item["title"])
            print("*" * 100)
    finally:
        lock.release()


with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
    # map submits every page number up front; note that exceptions raised inside
    # get_data are stored on the futures and only surface if the results are iterated
    executor.map(get_data, range(1, 3078))  # pages 1..3077

# Earlier single-threaded version, kept for reference:
# for i in range(1, 3078):  # 3078
#     print(i)
#     url = "https://sqzc.gd.gov.cn/sqzc/m/cms/policy/getPolicyListPage2?pageNumber={}&pageSize=10&keywords=&publisher=&city=".format(str(i))
#     response = requests.get(url, headers=headers)
#     html = response.json()
#     for data in html["data"]:
#         item = {}
#         item["title"] = data["title"]
#         item["publishDate"] = data["publishDate"]
#         item["publisher"] = data["publisher"]
#         item["city"] = data["city"]
#         item["viewCount"] = data["viewCount"]
#         item["place"] = data["place"]
#         insert_update_data_mysql(dbhost="localhost", dbname="crawler_data_prd", tablename="policy", **clear_dict(item))
#         print(item)
#         print("*" * 100)
```
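All fifty workers share one pymysql connection and take turns on the RLock, so the inserts are effectively single-threaded. A common alternative is to give each worker thread its own connection via threading.local. Below is a minimal sketch of that approach (connection parameters copied from above; the raw SQL and the `policy` column names mirror the item keys and are assumptions, since the original goes through the yscredit_tools helpers):

```python
import threading

import pymysql

_tls = threading.local()


def get_conn():
    """Return this thread's private connection, creating it on first use."""
    if not hasattr(_tls, "conn"):
        _tls.conn = pymysql.connect(host="10.1.3.29", port=3306,
                                    database="crawler_data_prd",
                                    user="root", password="root",
                                    charset='utf8', autocommit=True)
    return _tls.conn


def insert_policy(item):
    # Parameterized insert; no lock is needed because no connection is shared.
    # Column names are assumed to match the item keys.
    sql = ("INSERT INTO policy (title, publishDate, publisher, city, viewCount, place, page) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s)")
    with get_conn().cursor() as cur:
        cur.execute(sql, (item["title"], item["publishDate"], item["publisher"],
                          item["city"], item["viewCount"], item["place"], item["page"]))
```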