Unverified Commit 52124d70 authored by Zhiru Zhu's avatar Zhiru Zhu Committed by GitHub
Browse files

Improve delete (#1474)



* remove build openblas/lapack and use find_package

* update ubuntu_build_deps.sh

* update build image

* update build image

* update CHANGELOG

* trigger ci

* update image

* update centos build envvironment image on Jenkins CI

* trigger ci

* add serialize benchmark

Signed-off-by: default avatarZhiru Zhu <zhiru.zhu@zilliz.com>

* update deleted doc codec

Signed-off-by: default avatarZhiru Zhu <zhiru.zhu@zilliz.com>

* update delete in mem

Signed-off-by: default avatarZhiru Zhu <zhiru.zhu@zilliz.com>

* update CHANGELOG

Signed-off-by: default avatarZhiru Zhu <zhiru.zhu@zilliz.com>

* don't apply delete when force flush

Signed-off-by: default avatarZhiru Zhu <zhiru.zhu@zilliz.com>

* lint

Signed-off-by: default avatarZhiru Zhu <zhiru.zhu@zilliz.com>
parent 758f15f2
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -41,6 +41,7 @@ Please mark all change in change log and use the issue from GitHub
- \#815 - Support MinIO storage
- \#823 - Support binary vector tanimoto/jaccard/hamming metric
- \#853 - Support HNSW
- \#861 - Support DeleteById / SearchByID / GetVectorById / Flush
- \#910 - Change Milvus c++ standard to c++17
- \#1122 - Support AVX-512 in FAISS
- \#1204 - Add api to get table data information
@@ -67,7 +68,7 @@ Please mark all change in change log and use the issue from GitHub
- \#1234 - Do S3 server validation check when Milvus startup
- \#1263 - Allow system conf modifiable and some take effect directly
- \#1320 - Remove debug logging from faiss

- \#1444 - Improve delete

## Task
- \#1327 - Exclude third-party code from codebeat
+35 −8
Original line number Diff line number Diff line
@@ -17,6 +17,9 @@

#include "codecs/default/DefaultDeletedDocsFormat.h"

#include <fcntl.h>
#include <unistd.h>

#include <boost/filesystem.hpp>
#include <memory>
#include <string>
@@ -35,8 +38,9 @@ DefaultDeletedDocsFormat::read(const store::DirectoryPtr& directory_ptr, segment

    std::string dir_path = directory_ptr->GetDirPath();
    const std::string del_file_path = dir_path + "/" + deleted_docs_filename_;
    FILE* del_file = fopen(del_file_path.c_str(), "rb");
    if (del_file == nullptr) {

    int del_fd = open(del_file_path.c_str(), O_RDONLY, 00664);
    if (del_fd == -1) {
        std::string err_msg = "Failed to open file: " + del_file_path;
        ENGINE_LOG_ERROR << err_msg;
        throw Exception(SERVER_CANNOT_CREATE_FILE, err_msg);
@@ -46,9 +50,20 @@ DefaultDeletedDocsFormat::read(const store::DirectoryPtr& directory_ptr, segment
    auto deleted_docs_size = file_size / sizeof(segment::offset_t);
    std::vector<segment::offset_t> deleted_docs_list;
    deleted_docs_list.resize(deleted_docs_size);
    fread((void*)(deleted_docs_list.data()), sizeof(segment::offset_t), deleted_docs_size, del_file);

    if (::read(del_fd, deleted_docs_list.data(), file_size) == -1) {
        std::string err_msg = "Failed to read from file: " + del_file_path + ", error: " + std::strerror(errno);
        ENGINE_LOG_ERROR << err_msg;
        throw Exception(SERVER_WRITE_ERROR, err_msg);
    }

    deleted_docs = std::make_shared<segment::DeletedDocs>(deleted_docs_list);
    fclose(del_file);

    if (::close(del_fd) == -1) {
        std::string err_msg = "Failed to close file: " + del_file_path + ", error: " + std::strerror(errno);
        ENGINE_LOG_ERROR << err_msg;
        throw Exception(SERVER_WRITE_ERROR, err_msg);
    }
}

void
@@ -57,16 +72,28 @@ DefaultDeletedDocsFormat::write(const store::DirectoryPtr& directory_ptr, const

    std::string dir_path = directory_ptr->GetDirPath();
    const std::string del_file_path = dir_path + "/" + deleted_docs_filename_;
    FILE* del_file = fopen(del_file_path.c_str(), "ab");  // TODO(zhiru): append mode
    if (del_file == nullptr) {

    // TODO(zhiru): append mode
    int del_fd = open(del_file_path.c_str(), O_WRONLY | O_APPEND | O_CREAT, 00664);
    if (del_fd == -1) {
        std::string err_msg = "Failed to open file: " + del_file_path;
        ENGINE_LOG_ERROR << err_msg;
        throw Exception(SERVER_CANNOT_CREATE_FILE, err_msg);
    }

    auto deleted_docs_list = deleted_docs->GetDeletedDocs();
    fwrite((void*)(deleted_docs_list.data()), sizeof(segment::offset_t), deleted_docs->GetSize(), del_file);
    fclose(del_file);

    if (::write(del_fd, deleted_docs_list.data(), sizeof(segment::offset_t) * deleted_docs->GetSize()) == -1) {
        std::string err_msg = "Failed to write to file" + del_file_path + ", error: " + std::strerror(errno);
        ENGINE_LOG_ERROR << err_msg;
        throw Exception(SERVER_WRITE_ERROR, err_msg);
    }

    if (::close(del_fd) == -1) {
        std::string err_msg = "Failed to close file: " + del_file_path + ", error: " + std::strerror(errno);
        ENGINE_LOG_ERROR << err_msg;
        throw Exception(SERVER_WRITE_ERROR, err_msg);
    }
}

}  // namespace codec
+10 −0
Original line number Diff line number Diff line
@@ -144,6 +144,8 @@ DefaultVectorsFormat::write(const store::DirectoryPtr& directory_ptr, const segm
        throw Exception(SERVER_CANNOT_CREATE_FILE, err_msg);
    }

    auto start = std::chrono::high_resolution_clock::now();

    if (::write(rv_fd, vectors->GetData().data(), vectors->GetData().size()) == -1) {
        std::string err_msg = "Failed to write to file" + rv_file_path + ", error: " + std::strerror(errno);
        ENGINE_LOG_ERROR << err_msg;
@@ -155,6 +157,11 @@ DefaultVectorsFormat::write(const store::DirectoryPtr& directory_ptr, const segm
        throw Exception(SERVER_WRITE_ERROR, err_msg);
    }

    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    ENGINE_LOG_DEBUG << "Writing raw vectors took " << diff.count() << " s";

    start = std::chrono::high_resolution_clock::now();
    if (::write(uid_fd, vectors->GetUids().data(), sizeof(segment::doc_id_t) * vectors->GetCount()) == -1) {
        std::string err_msg = "Failed to write to file" + uid_file_path + ", error: " + std::strerror(errno);
        ENGINE_LOG_ERROR << err_msg;
@@ -165,6 +172,9 @@ DefaultVectorsFormat::write(const store::DirectoryPtr& directory_ptr, const segm
        ENGINE_LOG_ERROR << err_msg;
        throw Exception(SERVER_WRITE_ERROR, err_msg);
    }
    end = std::chrono::high_resolution_clock::now();
    diff = end - start;
    ENGINE_LOG_DEBUG << "Writing uids took " << diff.count() << " s";
}

void
+2 −2
Original line number Diff line number Diff line
@@ -38,10 +38,10 @@ class MemManager {
    DeleteVectors(const std::string& table_id, int64_t length, const IDNumber* vector_ids, uint64_t lsn) = 0;

    virtual Status
    Flush(const std::string& table_id) = 0;
    Flush(const std::string& table_id, bool apply_delete = true) = 0;

    virtual Status
    Flush(std::set<std::string>& table_ids) = 0;
    Flush(std::set<std::string>& table_ids, bool apply_delete = true) = 0;

    //    virtual Status
    //    Serialize(std::set<std::string>& table_ids) = 0;
+8 −6
Original line number Diff line number Diff line
@@ -37,7 +37,8 @@ MemManagerImpl::InsertVectors(const std::string& table_id, int64_t length, const
    flushed_tables.clear();
    if (GetCurrentMem() > options_.insert_buffer_size_) {
        ENGINE_LOG_DEBUG << "Insert buffer size exceeds limit. Performing force flush";
        auto status = Flush(flushed_tables);
        // TODO(zhiru): Don't apply delete here in order to avoid possible concurrency issues with Merge
        auto status = Flush(flushed_tables, false);
        if (!status.ok()) {
            return status;
        }
@@ -62,7 +63,8 @@ MemManagerImpl::InsertVectors(const std::string& table_id, int64_t length, const
    flushed_tables.clear();
    if (GetCurrentMem() > options_.insert_buffer_size_) {
        ENGINE_LOG_DEBUG << "Insert buffer size exceeds limit. Performing force flush";
        auto status = Flush(flushed_tables);
        // TODO(zhiru): Don't apply delete here in order to avoid possible concurrency issues with Merge
        auto status = Flush(flushed_tables, false);
        if (!status.ok()) {
            return status;
        }
@@ -126,7 +128,7 @@ MemManagerImpl::DeleteVectors(const std::string& table_id, int64_t length, const
}

Status
MemManagerImpl::Flush(const std::string& table_id) {
MemManagerImpl::Flush(const std::string& table_id, bool apply_delete) {
    ToImmutable(table_id);
    // TODO: There is actually only one memTable in the immutable list
    MemList temp_immutable_list;
@@ -139,7 +141,7 @@ MemManagerImpl::Flush(const std::string& table_id) {
    auto max_lsn = GetMaxLSN(temp_immutable_list);
    for (auto& mem : temp_immutable_list) {
        ENGINE_LOG_DEBUG << "Flushing table: " << mem->GetTableId();
        auto status = mem->Serialize(max_lsn);
        auto status = mem->Serialize(max_lsn, apply_delete);
        if (!status.ok()) {
            ENGINE_LOG_ERROR << "Flush table " << mem->GetTableId() << " failed";
            return status;
@@ -151,7 +153,7 @@ MemManagerImpl::Flush(const std::string& table_id) {
}

Status
MemManagerImpl::Flush(std::set<std::string>& table_ids) {
MemManagerImpl::Flush(std::set<std::string>& table_ids, bool apply_delete) {
    ToImmutable();

    MemList temp_immutable_list;
@@ -165,7 +167,7 @@ MemManagerImpl::Flush(std::set<std::string>& table_ids) {
    auto max_lsn = GetMaxLSN(temp_immutable_list);
    for (auto& mem : temp_immutable_list) {
        ENGINE_LOG_DEBUG << "Flushing table: " << mem->GetTableId();
        auto status = mem->Serialize(max_lsn);
        auto status = mem->Serialize(max_lsn, apply_delete);
        if (!status.ok()) {
            ENGINE_LOG_ERROR << "Flush table " << mem->GetTableId() << " failed";
            return status;
Loading