Unverified Commit 46440076, authored by shengjun.li, committed by GitHub
Browse files

#1928 Too many data and uid copies when loading files (#1931)



Signed-off-by: shengjun.li <shengjun.li@zilliz.com>

Co-authored-by: Jin Hai <hai.jin@zilliz.com>
parent 7ed6edc5
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -33,6 +33,7 @@ Please mark all change in change log and use the issue from GitHub
-   \#1885 Optimize knowhere unittest
-   \#1886 Refactor log on search and insert request
-   \#1897 Heap pop and push can be realized by heap_swap_top
-   \#1928 Fix too many data and uid copies when loading files
-   \#1930 Upgrade mishards to 0.8.0

## Task
+3 −6
Original line number Diff line number Diff line
@@ -90,15 +90,12 @@ DefaultVectorsFormat::read(const storage::FSHandlerPtr& fs_ptr, segment::Vectors
    for (; it != it_end; ++it) {
        const auto& path = it->path();
        if (path.extension().string() == raw_vector_extension_) {
            std::vector<uint8_t> vector_list;
            auto& vector_list = vectors_read->GetMutableData();
            read_vectors_internal(fs_ptr, path.string(), 0, INT64_MAX, vector_list);
            vectors_read->AddData(vector_list);
            vectors_read->SetName(path.stem().string());
        }
        if (path.extension().string() == user_id_extension_) {
            std::vector<segment::doc_id_t> uids;
        } else if (path.extension().string() == user_id_extension_) {
            auto& uids = vectors_read->GetMutableUids();
            read_uids_internal(fs_ptr, path.string(), uids);
            vectors_read->AddUids(uids);
        }
    }
}
+6 −10
Original line number Diff line number Diff line
@@ -375,8 +375,6 @@ ExecutionEngineImpl::Serialize() {

Status
ExecutionEngineImpl::Load(bool to_cache) {
    // TODO(zhiru): refactor

    index_ = std::static_pointer_cast<knowhere::VecIndex>(cache::CpuCacheMgr::GetInstance()->GetIndex(location_));
    bool already_in_cache = (index_ != nullptr);
    if (!already_in_cache) {
@@ -411,21 +409,19 @@ ExecutionEngineImpl::Load(bool to_cache) {
            auto& vectors = segment_ptr->vectors_ptr_;
            auto& deleted_docs = segment_ptr->deleted_docs_ptr_->GetDeletedDocs();

            auto vectors_uids = vectors->GetUids();
            auto& vectors_uids = vectors->GetMutableUids();
            auto count = vectors_uids.size();
            index_->SetUids(vectors_uids);
            ENGINE_LOG_DEBUG << "set uids " << index_->GetUids().size() << " for index " << location_;

            auto vectors_data = vectors->GetData();
            auto& vectors_data = vectors->GetData();

            faiss::ConcurrentBitsetPtr concurrent_bitset_ptr =
                std::make_shared<faiss::ConcurrentBitset>(vectors->GetCount());
            faiss::ConcurrentBitsetPtr concurrent_bitset_ptr = std::make_shared<faiss::ConcurrentBitset>(count);
            for (auto& offset : deleted_docs) {
                if (!concurrent_bitset_ptr->test(offset)) {
                concurrent_bitset_ptr->set(offset);
            }
            }

            auto dataset = knowhere::GenDataset(vectors->GetCount(), this->dim_, vectors_data.data());
            auto dataset = knowhere::GenDataset(count, this->dim_, vectors_data.data());
            if (index_type_ == EngineType::FAISS_IDMAP) {
                auto bf_index = std::static_pointer_cast<knowhere::IDMAP>(index_);
                bf_index->Train(knowhere::DatasetPtr(), conf);
+10 −4
Original line number Diff line number Diff line
@@ -28,10 +28,6 @@
namespace milvus {
namespace segment {

Vectors::Vectors(std::vector<uint8_t> data, std::vector<doc_id_t> uids, const std::string& name)
    : data_(std::move(data)), uids_(std::move(uids)), name_(name) {
}

void
Vectors::AddData(const std::vector<uint8_t>& data) {
    data_.reserve(data_.size() + data.size());
@@ -120,6 +116,16 @@ Vectors::Erase(std::vector<int32_t>& offsets) {
                     << diff.count() << " s";
}

std::vector<uint8_t>&
Vectors::GetMutableData() {
    return data_;
}

std::vector<doc_id_t>&
Vectors::GetMutableUids() {
    return uids_;
}

const std::vector<uint8_t>&
Vectors::GetData() const {
    return data_;
+6 −2
Original line number Diff line number Diff line
@@ -28,8 +28,6 @@ using doc_id_t = int64_t;

class Vectors {
 public:
    Vectors(std::vector<uint8_t> data, std::vector<doc_id_t> uids, const std::string& name);

    Vectors() = default;

    void
@@ -41,6 +39,12 @@ class Vectors {
    void
    SetName(const std::string& name);

    // Mutable access to the raw vector bytes held by this object.
    // Returns a reference, so writes through it modify this object's storage.
    std::vector<uint8_t>&
    GetMutableData();

    // Mutable access to the uid list held by this object.
    // Returns a reference, so writes through it modify this object's storage.
    std::vector<doc_id_t>&
    GetMutableUids();

    const std::vector<uint8_t>&
    GetData() const;