Unverified Commit 1d0ca07b authored by lgcareer's avatar lgcareer Committed by GitHub
Browse files

Merge pull request #334 from lgcareer/branch-1.0.2

add monitorServerState in install.sh
parents b304f154 d4011779
Loading
Loading
Loading
Loading
+30 −0
Original line number Diff line number Diff line
@@ -98,6 +98,9 @@ xlsFilePath="/tmp/xls"
# 不启动设置为false,如果为false,以下配置不需要修改
hdfsStartupSate="false"

#是否启动监控自启动脚本
monitorServerState="false"

# namenode地址,支持HA,需要将core-site.xml和hdfs-site.xml放到conf目录下
namenodeFs="hdfs://mycluster:8020"

@@ -364,3 +367,30 @@ fi
# 6,启动
echo "6,启动"
sh ${workDir}/script/start_all.sh

# 7. Start the monitor script when monitorServerState is "true".
# If a pid file from an earlier run exists, the process recorded in it is
# stopped first so that a fresh monitor replaces it.
monitor_pid="${workDir}/monitor_server.pid"
# Quote the variable so an empty/unset value does not break the test expression
if [ "true" = "${monitorServerState}" ]; then
        if [ -f "${monitor_pid}" ]; then
                TARGET_PID=$(cat "${monitor_pid}")
                # kill -0 delivers no signal; it only checks that the process exists
                if kill -0 "${TARGET_PID}" > /dev/null 2>&1; then
                        echo "monitor server running as process ${TARGET_PID}.Stopping"
                        kill "${TARGET_PID}"
                        sleep 5
                        # Still alive after the 5 second grace period: force kill
                        if kill -0 "${TARGET_PID}" > /dev/null 2>&1; then
                                echo "monitor server did not stop gracefully after 5 seconds: killing with kill -9"
                                kill -9 "${TARGET_PID}"
                        fi
                        # Report a stop only when a process was actually running
                        echo "monitor server running as process ${TARGET_PID}.Stopped success"
                else
                        echo "no monitor server to stop"
                fi
                # The pid file is stale in either case, so remove it
                rm -f "${monitor_pid}"
        fi
        # Launch the monitor in the background; stdout and stderr go to monitor_server.log
        nohup python -u "${workDir}/script/monitor_server.py" "${installPath}" "${zkQuorum}" "${zkMasters}" "${zkWorkers}" > "${workDir}/monitor_server.log" 2>&1 &
        # $! is the pid of the background job just started
        echo $! > "${monitor_pid}"
        echo "start monitor server success as process $(cat "${monitor_pid}")"

fi
+26 −14
Original line number Diff line number Diff line
@@ -9,10 +9,15 @@ yum -y install python-pip
pip install kazoo 安装
conda install -c conda-forge kazoo 安装

运行脚本:
nohup python -u monitor_server.py > nohup.out 2>&1 &
运行脚本及参数说明:
nohup python -u monitor_server.py /data1_1T/escheduler 192.168.xx.xx:2181,192.168.xx.xx:2181,192.168.xx.xx:2181 /escheduler/masters /escheduler/workers > monitor_server.log 2>&1 &
参数说明如下:
/data1_1T/escheduler的值来自install.sh中的installPath
192.168.xx.xx:2181,192.168.xx.xx:2181,192.168.xx.xx:2181的值来自install.sh中的zkQuorum
/escheduler/masters的值来自install.sh中的zkMasters
/escheduler/workers的值来自install.sh中的zkWorkers
'''

import sys
import socket
import os
import sched
@@ -20,13 +25,12 @@ import time
from datetime import datetime
from kazoo.client import KazooClient


schedule = sched.scheduler(time.time, time.sleep)

class ZkClient:
    def __init__(self):
        """Create and start the ZooKeeper client used by this monitor.

        The connection string is read from the module-level name
        ``zookeepers``, which the ``__main__`` block assigns from the
        command line before this class is instantiated.
        """
        # A single client built from the configured hosts string; no
        # hard-coded host list is created alongside it.
        self.zk = KazooClient(hosts=zookeepers)
        self.zk.start()

    # 读取配置文件,组装成字典
@@ -45,35 +49,37 @@ class ZkClient:

    # 重启服务
    def restart_server(self, inc):
        """Start master/worker servers that are configured but absent from ZooKeeper.

        The configured hosts are read from ``run_config.conf`` under the
        module-level ``install_path`` and resolved through
        ``self.get_ip_by_hostname``. They are compared with the children of
        ``masters_zk_path`` / ``workers_zk_path``; every configured host with
        no matching child node gets the daemon start script run over ssh.
        Finally the method re-schedules itself on the module-level
        ``schedule``.

        :param inc: delay passed to ``schedule.enter`` before the next run
            (seconds, since the scheduler is built on ``time.time`` /
            ``time.sleep``).
        """
        config_dict = self.read_file(install_path + '/conf/config/run_config.conf')

        master_list = config_dict.get('masters').split(',')
        print(master_list)
        master_list = [self.get_ip_by_hostname(item) for item in master_list]

        worker_list = config_dict.get('workers').split(',')
        print(worker_list)
        worker_list = [self.get_ip_by_hostname(item) for item in worker_list]

        if self.zk.exists(masters_zk_path):
            # The text before the first '_' of each child node name is compared
            # with the resolved addresses (presumably it is the host IP -- verify
            # against the code that registers the nodes).
            zk_master_list = [node.split('_')[0]
                              for node in self.zk.get_children(masters_zk_path)]
            for master in set(master_list) - set(zk_master_list):
                # Resolve once and reuse for both the log line and the command
                master_ip = self.get_ip_by_hostname(master)
                print("master " + master_ip + " 服务已经掉了")
                os.system('ssh ' + master_ip + ' sh ' + install_path + '/bin/escheduler-daemon.sh start master-server')

        if self.zk.exists(workers_zk_path):
            zk_worker_list = [node.split('_')[0]
                              for node in self.zk.get_children(workers_zk_path)]
            for worker in set(worker_list) - set(zk_worker_list):
                worker_ip = self.get_ip_by_hostname(worker)
                print("worker " + worker_ip + " 服务已经掉了")
                os.system('ssh  ' + worker_ip + ' sh ' + install_path + '/bin/escheduler-daemon.sh start worker-server')

        print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        # Re-arm the check so it runs again after ``inc``
        schedule.enter(inc, 0, self.restart_server, (inc,))
@@ -84,5 +90,11 @@ class ZkClient:
        schedule.enter(0, 0, self.restart_server, (inc,))
        schedule.run()
if __name__ == '__main__':
    # Four positional arguments are read below (sys.argv[1] .. sys.argv[4]),
    # so argv must hold at least five entries including the script name.
    if len(sys.argv) < 5:
        print('please input install_path,zookeepers,masters_zk_path and worker_zk_path')
        # Stop here instead of failing later with an IndexError
        sys.exit(1)
    # These module-level names are read by ZkClient
    install_path = sys.argv[1]
    zookeepers = sys.argv[2]
    masters_zk_path = sys.argv[3]
    workers_zk_path = sys.argv[4]
    zkClient = ZkClient()
    # 300 is handed to main() as ``inc`` (presumably the check interval in
    # seconds -- confirm against main's definition)
    zkClient.main(300)