Unverified Commit 186ba3ba authored by dailidong's avatar dailidong Committed by GitHub
Browse files

fix oom when no master is active in dev-1.3.0 (#2918)

* fix worker group config no effect

* remove codehaus janino jar
the license of janino may not be compatible with Apache v2

* Merge remote-tracking branch 'upstream/dev-1.3.0' into dev-1.3.0

# Conflicts:
#	dolphinscheduler-server/src/main/resources/config/install_config.conf

* datasource config

* Update datasource.properties

* fix RunConfig bug

* remove param monitor server state

* fix table T_DS_ALERT

* update h2 database

* fix #2910 master server will show exception for some time when it restart

* fix oom when no master is active

* fix worker oom when master server restart

* fix oom

* fix

* add UT

* fix worker group config no effect
parent 422fe513
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -349,7 +349,7 @@ public class NettyRemotingClient {
                return channel;
            }
        } catch (Exception ex) {
            logger.info("connect to {} error  {}", host, ex);
            logger.warn(String.format("connect to %s error", host), ex);
        }
        return null;
    }
+1 −1
Original line number Diff line number Diff line
@@ -87,7 +87,7 @@ public class LowerWeightHostManager extends CommonHostManager {
        this.workerHostWeightsMap = new ConcurrentHashMap<>();
        this.lock = new ReentrantLock();
        this.executorService = Executors.newSingleThreadScheduledExecutor(new NamedThreadFactory("LowerWeightHostManagerExecutor"));
        this.executorService.scheduleWithFixedDelay(new RefreshResourceTask(),0, 40, TimeUnit.SECONDS);
        this.executorService.scheduleWithFixedDelay(new RefreshResourceTask(),0, 5, TimeUnit.SECONDS);
        this.roundRobinHostManager = new RoundRobinHostManager();
        this.roundRobinHostManager.setZookeeperNodeManager(getZookeeperNodeManager());
    }
+16 −3
Original line number Diff line number Diff line
@@ -46,6 +46,7 @@ import static org.apache.dolphinscheduler.common.Constants.SLEEP_TIME_MILLIS;
public class TaskCallbackService {

    private final Logger logger = LoggerFactory.getLogger(TaskCallbackService.class);
    private static final int [] RETRY_BACKOFF = { 1, 2, 3, 5, 10, 20, 40, 100, 100, 100, 100, 200, 200, 200 };

    /**
     *  remote channels
@@ -58,6 +59,7 @@ public class TaskCallbackService {
    @Autowired
    private ZookeeperRegistryCenter zookeeperRegistryCenter;


    /**
     * netty remoting client
     */
@@ -99,14 +101,19 @@ public class TaskCallbackService {
                nettyRemoteChannel.getHost(),
                taskInstanceId);
        Set<String> masterNodes = null;
        int ntries = 0;
        while (Stopper.isRunning()) {
            masterNodes = zookeeperRegistryCenter.getMasterNodesDirectly();
            if (CollectionUtils.isEmpty(masterNodes)) {
                logger.info("try {} times but not find any master for task : {}.",
                        ntries + 1,
                        taskInstanceId);
                masterNodes = null;
                ThreadUtils.sleep(SLEEP_TIME_MILLIS);
                ThreadUtils.sleep(pause(ntries++));
                continue;
            }
            logger.info("find {} masters for task : {}.",
            logger.info("try {} times to find {} masters for task : {}.",
                    ntries + 1,
                    masterNodes.size(),
                    taskInstanceId);
            for (String masterNode : masterNodes) {
@@ -116,12 +123,18 @@ public class TaskCallbackService {
                }
            }
            masterNodes = null;
            ThreadUtils.sleep(SLEEP_TIME_MILLIS);
            ThreadUtils.sleep(pause(ntries++));
        }

        throw new IllegalStateException(String.format("all available master nodes : %s are not reachable for task: {}", masterNodes, taskInstanceId));
    }


    /**
     * Computes how long to wait before the next attempt, based on the attempt number.
     *
     * <p>The result is {@code SLEEP_TIME_MILLIS} multiplied by the {@code RETRY_BACKOFF}
     * entry selected by {@code ntries}. The table is used cyclically: once {@code ntries}
     * passes the last entry, selection wraps around to the first entry again.
     *
     * @param ntries number of attempts made so far (the first attempt is 0)
     * @return the pause length; presumably milliseconds, going by the name of
     *         {@code SLEEP_TIME_MILLIS}
     */
    public int pause(int ntries){
        // floorMod keeps the index non-negative even when ntries is negative (e.g. an
        // overflowed counter); plain % would produce a negative index and throw.
        return SLEEP_TIME_MILLIS * RETRY_BACKOFF[Math.floorMod(ntries, RETRY_BACKOFF.length)];
    }


    private NettyRemoteChannel getRemoteChannel(Channel newChannel, long opaque, int taskInstanceId){
        NettyRemoteChannel remoteChannel = new NettyRemoteChannel(newChannel, opaque);
        addRemoteChannel(taskInstanceId, remoteChannel);
+15 −19
Original line number Diff line number Diff line
@@ -39,6 +39,7 @@ import org.apache.dolphinscheduler.server.zk.SpringZKServer;
import org.apache.dolphinscheduler.service.bean.SpringApplicationContext;
import org.apache.dolphinscheduler.service.zk.ZookeeperCachedOperator;
import org.apache.dolphinscheduler.service.zk.ZookeeperConfig;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.mockito.Mockito;
@@ -46,6 +47,7 @@ import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;

import java.io.IOException;
import java.util.Date;

/**
@@ -91,12 +93,8 @@ public class TaskCallbackServiceTest {
        ackCommand.setStartTime(new Date());
        taskCallbackService.sendAck(1, ackCommand.convert2Command());

        Thread.sleep(5000);

        Stopper.stop();

        Thread.sleep(5000);

        nettyRemotingServer.close();
        nettyRemotingClient.close();
    }
@@ -140,8 +138,13 @@ public class TaskCallbackServiceTest {
        Stopper.stop();
    }

    @Test(expected = IllegalStateException.class)
    public void testSendAckWithIllegalStateException1(){
    /**
     * Checks that {@code pause(3)} on the service under test returns 5000.
     */
    @Test
    public void testPause(){
        Assert.assertEquals(5000, taskCallbackService.pause(3));
    }

    @Test
    public void testSendAck1(){
        masterRegistry.registry();
        final NettyServerConfig serverConfig = new NettyServerConfig();
        serverConfig.setListenPort(30000);
@@ -153,27 +156,20 @@ public class TaskCallbackServiceTest {
        NettyRemotingClient nettyRemotingClient = new NettyRemotingClient(clientConfig);
        Channel channel = nettyRemotingClient.getChannel(Host.of("localhost:30000"));
        taskCallbackService.addRemoteChannel(1, new NettyRemoteChannel(channel, 1));
        channel.close();
//        channel.close();

        TaskExecuteAckCommand ackCommand = new TaskExecuteAckCommand();
        ackCommand.setTaskInstanceId(1);
        ackCommand.setStartTime(new Date());

        nettyRemotingServer.close();

        taskCallbackService.sendAck(1, ackCommand.convert2Command());
        try {
            Thread.sleep(5000);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }

        Assert.assertEquals(true, channel.isOpen());

        Stopper.stop();

        try {
            Thread.sleep(5000);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        nettyRemotingServer.close();
        nettyRemotingClient.close();
    }

//    @Test(expected = IllegalStateException.class)
+3 −2
Original line number Diff line number Diff line
@@ -95,8 +95,9 @@ public class ZKServer {
     * @param port The port to listen on
     */
    public static void startLocalZkServer(final int port) {

        // NOTE(review): this listing looks like a rendered diff with the +/- markers lost;
        // the call on the next line appears to be the pre-change version that the lines
        // below it replace — confirm against the actual file before relying on it.
        startLocalZkServer(port, System.getProperty("user.dir") +"/zookeeper_data", ZooKeeperServer.DEFAULT_TICK_TIME,"20");
        // Data directory is "zookeeper_data" under the directory named by the user.dir property.
        String zkDataDir = System.getProperty("user.dir") +"/zookeeper_data";
        // Log the resolved path so the location of the local ZK data is visible at startup.
        logger.info("zk server starting, data dir path:{}" , zkDataDir);
        // Delegate to the four-argument overload with ZooKeeper's default tick time.
        startLocalZkServer(port, zkDataDir, ZooKeeperServer.DEFAULT_TICK_TIME,"60");
    }

    /**
Loading