Commit 21840934 authored by jinhai's avatar jinhai
Browse files

Merge branch 'branch-0.5.0-yk' into 'branch-0.5.0'

MS-637 - Out of memory when loading too many tasks

See merge request megasearch/milvus!692

Former-commit-id: 1ac75d2594d27f78ff594622a25db0ad6639efb1
parents d16c93e6 16bf99a9
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@ Please mark all change in change log and use the ticket from JIRA.
- MS-601 - Docker logs error caused by get CPUTemperature error
- MS-622 - Delete vectors should be failed if date range is invalid
- MS-620 - Get table row counts display wrong error code
- MS-637 - Out of memory when loading too many tasks

## Improvement
- MS-552 - Add and change the easylogging library
+13 −26
Original line number Diff line number Diff line
@@ -49,40 +49,27 @@ load_simple_config() {
    std::vector<std::string> pool;
    config.GetResourceConfigPool(pool);

    bool cpu = false;
    std::set<uint64_t> gpu_ids;
    // get resources
    bool use_cpu_to_compute = false;
    for (auto& resource : pool) {
        if (resource == "cpu") {
            cpu = true;
            use_cpu_to_compute = true;
            break;
        } else {
            if (resource.length() < 4 || resource.substr(0, 3) != "gpu") {
                // error
                exit(-1);
            }
            auto gpu_id = std::stoi(resource.substr(3));
            if (gpu_id >= get_num_gpu()) {
                // error
                exit(-1);
            }
            gpu_ids.insert(gpu_id);
        }
    }
    auto gpu_ids = get_gpu_pool();

    // create and connect
    ResMgrInst::GetInstance()->Add(ResourceFactory::Create("disk", "DISK", 0, true, false));

    auto io = Connection("io", 500);
    if (cpu) {
        ResMgrInst::GetInstance()->Add(ResourceFactory::Create("cpu", "CPU", 0, true, true));
        ResMgrInst::GetInstance()->Connect("disk", "cpu", io);
    } else {
        ResMgrInst::GetInstance()->Add(ResourceFactory::Create("cpu", "CPU", 0, true, false));
    ResMgrInst::GetInstance()->Add(ResourceFactory::Create("cpu", "CPU", 0, true, use_cpu_to_compute));
    ResMgrInst::GetInstance()->Connect("disk", "cpu", io);

    auto pcie = Connection("pcie", 12000);
    for (auto& gpu_id : gpu_ids) {
        ResMgrInst::GetInstance()->Add(ResourceFactory::Create(std::to_string(gpu_id), "GPU", gpu_id, true, true));
            ResMgrInst::GetInstance()->Connect("cpu", std::to_string(gpu_id), io);
        }
        ResMgrInst::GetInstance()->Connect("cpu", std::to_string(gpu_id), pcie);
    }
}

+7 −0
Original line number Diff line number Diff line
@@ -110,11 +110,15 @@ Scheduler::OnLoadCompleted(const EventPtr& event) {
                break;
            }
            case TaskLabelType::BROADCAST: {
                if (resource->HasExecutor() == false) {
                    load_completed_event->task_table_item_->Move();
                }
                Action::PushTaskToAllNeighbour(load_completed_event->task_table_item_->task, resource);
                break;
            }
            default: { break; }
        }
        resource->WakeupLoader();
    }
}

@@ -127,6 +131,9 @@ Scheduler::OnStartUp(const EventPtr& event) {

// Handler for a finish-task event.
//
// Obtains the resource handle referenced by the event via
// `event->resource_.lock()` and, when that handle is non-empty, calls
// WakeupLoader() on it. When the handle is empty the event is ignored.
//
// NOTE(review): `resource_` is presumably a std::weak_ptr to the originating
// resource - confirm against the Event declaration.
void
Scheduler::OnFinishTask(const EventPtr& event) {
    auto owner = event->resource_.lock();
    if (!owner) {
        // Nothing to wake: the handle could not be obtained.
        return;
    }

    owner->WakeupLoader();
}

void
+13 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@
#include "scheduler/TaskTable.h"
#include "Utils.h"
#include "event/TaskTableUpdatedEvent.h"
#include "utils/Log.h"

#include <ctime>
#include <sstream>
@@ -157,6 +158,18 @@ TaskTableItem::Dump() {

std::vector<uint64_t>
TaskTable::PickToLoad(uint64_t limit) {
    size_t count = 0;
    for (int j = last_finish_ + 1; j < table_.size(); ++j) {
        if (not table_[j]) {
            SERVER_LOG_WARNING << "table[" << j << "] is nullptr";
        }
        if (table_[j]->state == TaskTableItemState::LOADED) {
            ++count;
            if (count > 2)
                return std::vector<uint64_t>();
        }
    }

    std::vector<uint64_t> indexes;
    bool cross = false;
    for (uint64_t i = last_finish_ + 1, count = 0; i < table_.size() && count < limit; ++i) {
+1 −1
Original line number Diff line number Diff line
@@ -46,7 +46,7 @@ TEST(NormalTest, INST_TEST) {
    res_mgr->Start();
    scheduler->Start();

    const uint64_t NUM_TASK = 1000;
    const uint64_t NUM_TASK = 2;
    std::vector<std::shared_ptr<ms::TestTask>> tasks;
    ms::TableFileSchemaPtr dummy = nullptr;

Loading