HOME> 世界杯第二> 阿里havenask之indexlib

阿里havenask之indexlib

世界杯第二 2025-05-13 15:09:29
GitHub - alibaba/havenask havenask/aios/indexlib at main · alibaba/havenask · GitHub indexlib是其中核心的索引库。本文将用脑图详解其设计。 倒排索引:context-->doc...

GitHub - alibaba/havenask

havenask/aios/indexlib at main · alibaba/havenask · GitHub

indexlib是其中核心的索引库。本文将用脑图详解其设计。

倒排索引:content(词项)-->docid;正排索引:docid-->(Attribute/Schema);KV索引:key-->value

测试用例

#include <chrono>
#include <cstdint>
#include <string>
#include <vector>

#include "gtest/gtest.h"

#include "autil/legacy/any_jsonizable.h"

#include "glog/logging.h"

#include "autil/legacy/jsonizable.h"

#include "config/index_partition_schema.h"

#include "index_base/schema_adapter.h"

#include "storage/file_system_wrapper.h"

#include "document/raw_document.h"

#include "document/raw_document/default_raw_document.h"

#include "document/extend_document/indexlib_extend_document.h"

#include "document/index_document/normal_document/index_document.h"

#include "document/index_document/normal_document/normal_document.h"

#include "document/index_document/normal_document/attribute_document.h"

#include "document/index_document/normal_document/summary_document.h"

#include "document/document_factory_wrapper.h"

#include "document/document_parser.h"

#include "document/document_parser/normal_parser/normal_document_parser.h"

#include "config/field_schema.h"

#include "partition/index_builder.h"

#include "util/memory_control/quota_control.h"

#include "util/memory_control/memory_quota_controller.h"

#include "partition/online_partition.h"

#include "util/term.h"

#include "partition/index_partition_reader.h"

#include "index/normal/summary/summary_reader.h"

#include "document/index_document/normal_document/search_summary_document.h"

#include "alog/Configurator.h"

// Root alog logger. Assigned in the test_json test and dereferenced in TestOnlineBuild.
static alog::Logger *_logger;

// Short alias for the autil::legacy namespace.
namespace ajson = autil::legacy;

class MySchema : public autil::legacy::Jsonizable

{

public:

void Jsonize(autil::legacy::Jsonizable::JsonWrapper& json) override

{

json.Jsonize("name", name);

json.Jsonize("age", age);

}

void Print(){

LOG(INFO) << "name is: " << name << " age: " << age;

}

private:

std::string name;

int age;

};

// Configures alog from ./logger.conf, stores the root logger in _logger, then
// deserializes a small JSON object into MySchema and prints it.
TEST(TestBuilder, test_json) {
    alog::Configurator::configureLogger("./logger.conf");
    _logger = alog::Logger::getRootLogger();
    // IE_ROOT_LOG_SETLEVEL(DEBUG);

    LOG(INFO) << "test json...";

    // The literal is kept verbatim because it is logged as-is.
    const std::string payload = R"(

{

"name" : "hello",

"age" : 3

}

)";
    LOG(INFO) << "raw json: " << payload;

    MySchema parsed;
    autil::legacy::FromJsonString(parsed, payload);
    parsed.Print();
    // LOG(INFO) << parsed;
}

/// Returns the JSON text of the table schema used by the tests.
///
/// The text is assembled from three fragments in this order: the opening part
/// (table name/type, "fields", "indexs", "attributes"), the "summarys"
/// section, and the closing brace.
static std::string GetSchema() {
    const char* const kOpening = R"(

{

"table_name": "mainse_summary",

"table_type": "normal",

"fields": [

{ "field_name": "quantity", "field_type": "INTEGER" },

{ "field_name": "provcity", "compress_type": "uniq|equal", "field_type": "STRING" },

{ "field_name": "category", "field_type": "INTEGER" },

{ "field_name": "nid", "field_type": "STRING" },

{ "field_name": "zk_time", "field_type": "STRING" },

{ "field_name": "title", "field_type": "STRING" },

{ "field_name": "user", "field_type": "STRING" },

{ "field_name": "user_id", "field_type": "STRING" },

{ "field_name": "vip", "field_type": "STRING" },

{ "field_name": "ends", "field_type": "STRING" },

{ "field_name": "pid", "field_type": "STRING" },

{ "field_name": "nick", "field_type": "STRING" },

{ "field_name":"int32_multi", "field_type":"int32", "multi_value": true}

],

"indexs": [

{

"index_fields": "nid",

"index_name": "pk",

"index_type": "PRIMARYKEY64",

"pk_hash_type": "default_hash",

"pk_storage_type": "hash_table"

}

],

"attributes" : [

{

"pack_name" : "pack_attr",

"sub_attributes" : ["category", "int32_multi"]

},

{

"pack_name" : "uniq_pack_attr",

"sub_attributes" : ["nick", "pid"],

"compress_type" : "uniq"

},

"quantity", "provcity", "vip"

],

)";

    const char* const kSummarySection = R"(

"summarys": {

"summary_fields": [ "nid", "title", "pid", "provcity", "category" ]

}

)";

    const char* const kClosing = R"(

}

)";

    std::string assembled(kOpening);
    assembled += kSummarySection;
    assembled += kClosing;
    return assembled;
}

// Schema stored by the load_schema test; the builder tests assert it is set before using it.
static indexlib::config::IndexPartitionSchemaPtr global_schema;

// Loads the schema returned by GetSchema() in three different ways, logs its
// field/index/attribute layout and stores the result in global_schema.
TEST(TestBuilder, load_schema) {
    LOG(INFO) << "test load schema";

    std::string schema_json = GetSchema();
    LOG(INFO) << schema_json;

    // 1. Deserialize the JSON text directly into a freshly constructed schema.
    indexlib::config::IndexPartitionSchemaPtr schema(new indexlib::config::IndexPartitionSchema("myschema"));
    FromJsonString(*schema, schema_json);

    // 2. Load the same JSON text through SchemaAdapter; it must not throw.
    ASSERT_NO_THROW(indexlib::index_base::SchemaAdapter::LoadSchema(schema_json, schema));

    // 3. Write the JSON to /tmp/myschema.json and load it back by directory and file name.
    indexlib::storage::FileSystemWrapper::Delete("/tmp/myschema.json", true);
    indexlib::storage::FileSystemWrapper::AtomicStore("/tmp/myschema.json", schema_json);
    schema = indexlib::index_base::SchemaAdapter::LoadSchema("/tmp", "myschema.json");
    ASSERT_TRUE(schema);

    // Log the sizes of the individual sub-schemas.
    LOG(INFO) << "field count:" << schema->GetFieldSchema()->GetFieldCount();
    LOG(INFO) << "index count:" << schema->GetIndexSchema()->GetIndexCount();
    // LOG(INFO) << "index count:" << schema->GetSummarySchema()->GetSummaryCount();
    LOG(INFO) << "attr count:" << schema->GetAttributeSchema()->GetAttributeCount();

    auto attr_schema = schema->GetAttributeSchema();
    for(int i = 0; i < attr_schema->GetAttributeCount(); i++) {
        LOG(INFO) << "attr name: " << attr_schema->GetAttributeConfig(i)->GetAttrName()
                  << " id: " << attr_schema->GetAttributeConfig(i)->GetAttrId();
    }

    for(int i = 0; i < attr_schema->GetPackAttributeCount(); i++) {
        LOG(INFO) << "pack attr: " << attr_schema->GetPackAttributeConfig(i)->GetAttrName();
        // The element type is spelled out: a bare `std::vector` with no
        // initializer cannot be deduced and does not compile.
        std::vector<std::string> attrNames;
        attr_schema->GetPackAttributeConfig(i)->GetSubAttributeNames(attrNames);
        for(auto &sub: attrNames) {
            LOG(INFO) << "sub name: " << sub;
        }
    }

    // Only referenced by the commented-out diagnostics below.
    auto summary_schema = schema->GetSummarySchema();
    // LOG(INFO) << "compress: " << summary_schema->GetSummaryConfig("quantity")->GetFieldConfig()->GetCompressType().GetCompressStr();
    // for(int i = 0; i < summary_schema->GetSummaryGroupConfigCount(); i++) {
    // auto group = summary_schema->GetSummaryGroupConfig(i);
    // LOG(INFO) << "summary group: " << group->GetGroupName()
    // << " compress_type: " << group->GetCompressType()
    // << " filed_count: " << group->GetSummaryFieldsCount();
    // }

    // Publish the schema for the tests that follow.
    global_schema = schema;
}

/// Builds a DefaultRawDocument from a "key=value" list.
///
/// @param docStr  Pairs separated by '|'; if that split yields at most one
///                piece, the string is split on ',' instead.
/// @return A raw document with one setField() call per pair. Key and value are
///         trimmed; the value is left empty unless the pair splits into exactly
///         two parts around '='.
///
/// NOTE(review): nothing here strips a trailing ';', so it stays part of the
/// last value — confirm that this is intended for the sample documents.
static indexlib::document::RawDocumentPtr ParseDocStr(const std::string& docStr)
{
    const std::string DP_SPATIAL_KEY_VALUE_SEPARATOR = "|";
    const std::string DP_KEY_VALUE_SEPARATOR = ",";
    const std::string DP_KEY_VALUE_EQUAL_SYMBOL = "=";

    std::vector<std::string> keyValues = autil::StringUtil::split(docStr, DP_SPATIAL_KEY_VALUE_SEPARATOR);
    if (keyValues.size() <= 1)
    {
        keyValues = autil::StringUtil::split(docStr, DP_KEY_VALUE_SEPARATOR);
    }

    indexlib::document::RawDocumentPtr rawDoc(new indexlib::document::DefaultRawDocument);
    for (const std::string& pair : keyValues)
    {
        const std::vector<std::string> keyValue =
            autil::StringUtil::split(pair, DP_KEY_VALUE_EQUAL_SYMBOL);
        if (keyValue.empty())
        {
            // Nothing left after splitting (e.g. the piece was just "="):
            // skip it instead of indexing an empty vector.
            continue;
        }

        std::string key = keyValue[0];
        autil::StringUtil::trim(key);

        std::string value;
        if (keyValue.size() == 2)
        {
            value = keyValue[1];
            autil::StringUtil::trim(value);
        }
        rawDoc->setField(key, value);
    }
    return rawDoc;
}

/// Returns the current std::chrono::system_clock time as a count of
/// microseconds since that clock's epoch.
static int64_t GetTsMirco() {
    const auto sinceEpoch = std::chrono::system_clock::now().time_since_epoch();
    // duration_cast needs an explicit target duration; microseconds matches
    // the function name.
    return std::chrono::duration_cast<std::chrono::microseconds>(sinceEpoch).count();
}

// Document stored by the CreateDoc test; the builder tests assert it is set before building it.
static indexlib::document::DocumentPtr global_doc;

// Turns a "key=value" document string into a raw document, wraps it in an
// extend document, parses it against the schema from GetSchema() and stores
// the resulting document in global_doc.
TEST(TestBuilder, CreateDoc){
    std::string doc_str1 =
        "cmd=add,quantity=10,provcity=shanghai,category=1,nid=123,zk_time=1684305956,title=hello1,user=huozai,user_id=98765,vip=svip,ends=some_ends,pid=203,nick=world,int32_multi=2 3;";
    // Additional sample documents; only doc_str1 is parsed below.
    std::string doc_str2 =
        "cmd=add,quantity=11,provcity=shanghaj,category=2,nid=124,zk_time=1684305957,title=hello2,user=huozaj,user_id=98766,vip=tvip,ends=some_fnds,pid=204,nick=worle,int32_multi=4 5;";
    std::string doc_str3 =
        "cmd=add,quantity=12,provcity=shanghak,category=3,nid=125,zk_time=1684305958,title=hello3,user=huozak,user_id=98767,vip=ivip,ends=some_gnds,pid=205,nick=worlf,int32_multi=6 7;";

    // 1. Document string -> raw document.
    auto raw_doc = ParseDocStr(doc_str1);
    ASSERT_TRUE(raw_doc);
    raw_doc->setDocTimestamp(GetTsMirco());
    LOG(INFO) << raw_doc->toString();

    raw_doc->SetTimestamp(GetTsMirco());
    // "__ts__" is the timestamp divided by 1000000 plus a fixed offset of 10000000.
    raw_doc->setField("__ts__", std::to_string(raw_doc->GetTimestamp()/1000000 + 10000000));
    raw_doc->setDocTimestamp(raw_doc->GetTimestamp());

    // 2. Raw document -> extend document -> parsed document.
    indexlib::document::IndexlibExtendDocumentPtr extend_doc(new indexlib::document::IndexlibExtendDocument());
    extend_doc->setRawDocument(raw_doc);

    auto classified_doc = extend_doc->getClassifiedDocument();
    LOG(INFO) << "classifiedDoc: pk: " << classified_doc->getPrimaryKey();
    LOG(INFO) << "classifiedDoc: index: " << classified_doc->getIndexDocument()->GetPrimaryKey();
    LOG(INFO) << "classifiedDoc: attr packfield count: " << classified_doc->getAttributeDoc()->GetPackFieldCount();
    LOG(INFO) << "classifiedDoc: summary nonempty count: " << classified_doc->getSummaryDoc()->GetNotEmptyFieldCount();
    LOG(INFO) << "raw doc type: " << raw_doc->getDocOperateType();

    std::string schema_json = GetSchema();
    indexlib::config::IndexPartitionSchemaPtr schema(new indexlib::config::IndexPartitionSchema("myschema"));
    FromJsonString(*schema, schema_json);
    // schema->SetDefaultTTL(1000000); //TTL
    // schema->SetEnableTTL(true, DEFAULT_REGIONID, "__ts__");

    indexlib::document::DocumentFactoryWrapper wrapper(schema);
    wrapper.Init();
    indexlib::document::DocumentParserPtr parser(wrapper.CreateDocumentParser());

    // dynamic_cast needs an explicit target type. The check only confirms that
    // the factory produced a NormalDocumentParser; parsing goes through `parser`.
    auto normal_doc_parser = dynamic_cast<indexlib::document::NormalDocumentParser*>(parser.get());
    ASSERT_TRUE(normal_doc_parser != nullptr);

    indexlib::document::DocumentPtr doc = parser->Parse(extend_doc);
    ASSERT_TRUE(doc);

    indexlib::document::NormalDocumentPtr normal_doc = DYNAMIC_POINTER_CAST( indexlib::document::NormalDocument, doc);
    ASSERT_TRUE(normal_doc);

    // Publish the parsed document for the builder tests.
    global_doc = doc;
}

// Offline build: feeds global_doc 1000 times into an IndexBuilder rooted at
// ./tdata/builder, then merges and ends the index. Requires global_schema and
// global_doc to have been set by the earlier tests.
TEST(TestBuilder, TestOfflineBuilder){
    LOG(INFO) << "-----------------------begin offline---------------------";

    ASSERT_TRUE(global_schema);
    ASSERT_TRUE(global_doc);

    // Partition options in offline mode with a single-split merge config.
    indexlib::config::IndexPartitionOptionsPtr partitionOptions(new indexlib::config::IndexPartitionOptions);
    partitionOptions->SetIsOnline(false);

    const std::string splitConfigJson = R"({"class_name":"default","parameters":{"split_num":"1"}})";
    autil::legacy::FromJsonString(partitionOptions->GetMergeConfig().GetSplitSegmentConfig(), splitConfigJson);

    auto& buildCfg = partitionOptions->GetBuildConfig();
    buildCfg.enablePackageFile = false;
    buildCfg.maxDocCount = 1024;
    buildCfg.ttl = 1000000;

    // Quota object constructed with 100 * 1024 * 1024.
    indexlib::util::QuotaControlPtr quota(new indexlib::util::QuotaControl(1024*1024*100));

    // indexlib::storage::FileSystemWrapper::DeleteDir("./tdata/builder");
    indexlib::partition::IndexBuilderPtr offlineBuilder(
        new indexlib::partition::IndexBuilder("./tdata/builder", *partitionOptions, global_schema, quota));
    ASSERT_TRUE(offlineBuilder->Init());

    // Build the same document 1000 times.
    for (int remaining = 1000; remaining > 0; --remaining) {
        ASSERT_TRUE(offlineBuilder->Build(global_doc));
    }

    ASSERT_TRUE(offlineBuilder->Merge(*partitionOptions));
    offlineBuilder->EndIndex();
}

// Online build and query: opens an OnlinePartition on ./tdata/builder, builds
// global_doc 100 times through an IndexBuilder bound to that partition, then
// looks up primary key "123" and reads its summary. Requires global_schema,
// global_doc and _logger to have been set by the earlier tests.
TEST(TestBuilder, TestOnlineBuild){
    LOG(INFO) << "-----------------------begin online---------------------";

    ASSERT_TRUE(global_schema);
    ASSERT_TRUE(global_doc);
    // _logger is dereferenced below; fail cleanly if test_json did not set it.
    ASSERT_TRUE(_logger != nullptr);

    indexlib::config::IndexPartitionOptionsPtr options(new indexlib::config::IndexPartitionOptions);
    options->SetIsOnline(true);

    std::string mergeConfigStr = "{\"class_name\":\"default\",\"parameters\":{\"split_num\":\"1\"}}";
    autil::legacy::FromJsonString(options->GetMergeConfig().GetSplitSegmentConfig(), mergeConfigStr);

    auto& buildConfig = options->GetBuildConfig();
    buildConfig.enablePackageFile = false;
    buildConfig.maxDocCount = 1024;
    buildConfig.ttl = 1000000;

    indexlib::util::MemoryQuotaControllerPtr quotaControl(new indexlib::util::MemoryQuotaController(100*1024*1024));
    indexlib::partition::OnlinePartitionPtr part(new indexlib::partition::OnlinePartition("online", quotaControl));

    options->GetOnlineConfig().onDiskFlushRealtimeIndex = true;
    options->GetOnlineConfig().maxRealtimeDumpInterval = 10000;
    options->GetOnlineConfig().maxRealtimeMemSize = 100*1024*1024;

    auto rs = part->Open("./tdata/builder", "", global_schema, *options);
    ASSERT_EQ(rs, indexlib::partition::IndexPartition::OS_OK);

    // TODO: create the partition through IndexPartitionCreator::Create instead.
    indexlib::util::QuotaControlPtr memoryQuotaControlOnline(new indexlib::util::QuotaControl(100*1024*1024));
    indexlib::partition::IndexBuilderPtr online_builder(new indexlib::partition::IndexBuilder(part, memoryQuotaControlOnline));
    ASSERT_TRUE(online_builder->Init());

    // Add the same document 100 times through the online builder.
    for(int i = 0; i < 100; i++) {
        ASSERT_TRUE(online_builder->Build(global_doc));
    }
    online_builder->EndIndex();
    LOG(INFO) << "online add doc ok";

    // Query: look up the term "123" in the "pk" index.
    auto partReader = part->GetReader();
    ASSERT_TRUE(partReader);

    auto indexReader = partReader->GetIndexReader();
    ASSERT_TRUE(indexReader);

    indexlib::util::Term t("123", "pk");
    // NOTE(review): `ite` is never released here — confirm who owns the
    // iterator returned by Lookup().
    auto ite = indexReader->Lookup(t);
    ASSERT_TRUE(ite != nullptr);

    auto docid = ite->SeekDoc(INVALID_DOCID);
    LOG(INFO) << "type: " << ite->GetMatchValueType() << " dockid: " << docid;
    // Stop here if the key was not found rather than asking the summary
    // reader for an invalid doc id.
    ASSERT_NE(docid, INVALID_DOCID);

    LOG(INFO) << "enable: " << _logger->isLevelEnabled(alog::LOG_LEVEL_INFO);
    IE_LOG(ERROR, "hello ie log");
    ALOG_ERROR(_logger, "hello :%s", "world");

    auto summaryReader = partReader->GetSummaryReader();
    ASSERT_TRUE(summaryReader != nullptr);

    indexlib::document::SearchSummaryDocument summaryDoc(nullptr, 40960);
    ASSERT_TRUE(summaryReader->GetDocument(docid, &summaryDoc));

    // Only the first three summary fields are printed.
    int count = 3;
    LOG(INFO) << "field count: " << count;
    for(int i = 0; i < count; i++) {
        const autil::ConstString* field = summaryDoc.GetFieldValue(i);
        ASSERT_TRUE(field != nullptr);
        LOG(INFO) << "value is: " << *field;
    }
}

LOG配置

logger.conf

alog.rootLogger=INFO, indexlibAppender

alog.max_msg_len=2000000

alog.appender.indexlibAppender=ConsoleAppender

#alog.appender.indexlibAppender=FileAppender

#alog.appender.indexlibAppender.fileName=TestLog.log

alog.appender.indexlibAppender.flush=true

alog.appender.indexlibAppender.layout=PatternLayout

#alog.appender.indexlibAppender.layout.LogPattern=[%%d] [%%t], %%f() [%%n] [%%l] [%%m]

alog.appender.indexlibAppender.layout.LogPattern=[%%h][xxxx][%%d][%%l][%%t][%%p][%%F:%%n %%f] : [%%m]

alog.logger.indexlib=INFO

inherit.indexlib.test=false

alog.logger.local.LocalFileSystem=INFO

alog.logger.ErrorLogCollector=TRACE1,ErrorLogCollectorAppender

inherit.ErrorLogCollector=false

alog.appender.ErrorLogCollectorAppender=FileAppender

alog.appender.ErrorLogCollectorAppender.fileName=error_log_collector.log

alog.appender.ErrorLogCollectorAppender.flush=true

alog.appender.ErrorLogCollectorAppender.max_file_size=100

alog.appender.ErrorLogCollectorAppender.layout=PatternLayout

alog.appender.ErrorLogCollectorAppender.layout.LogPattern=[%%h][xxxx][%%d][%%l][%%t][%%p][%%F:%%n %%f] : [%%m]

alog.appender.ErrorLogCollectorAppender.compress=true

alog.appender.ErrorLogCollectorAppender.log_keep_count=100

# alog.logger.indexlib.test=INFO, indexlibTestAppender

# alog.appender.indexlibTestAppender=ConsoleAppender

# alog.appender.indexlibTestAppender.layout=PatternLayout

# alog.appender.indexlibTestAppender.layout.LogPattern=[%%d] [%%t/%%p,PSM:%%f():%%n] [%%m]

# inherit.indexlib.test=false

生成数据在磁盘的结构

├── deploy_meta.0

├── deploy_meta.1

├── index_format_version

├── __indexlib_fs_root_link__@1684478171 -> ./tdata/builder

├── join_index_partition

├── merge_resource

│ └── version.1

├── rt_index_partition

│ ├── segment_1073741824_level_0

│ │ ├── attribute

│ │ │ ├── pack_attr

│ │ │ │ ├── data

│ │ │ │ ├── data_info

│ │ │ │ └── offset

│ │ │ ├── provcity

│ │ │ │ ├── data

│ │ │ │ ├── data_info

│ │ │ │ └── offset

│ │ │ ├── quantity

│ │ │ │ └── data

│ │ │ ├── uniq_pack_attr

│ │ │ │ ├── data

│ │ │ │ ├── data_info

│ │ │ │ └── offset

│ │ │ └── vip

│ │ │   ├── data

│ │ │   ├── data_info

│ │ │   └── offset

│ │ ├── counter

│ │ ├── deletionmap

│ │ │ └── data_1073741824

│ │ ├── deploy_index

│ │ ├── index

│ │ │ ├── pk

│ │ │ │ └── data

│ │ │ └── virtual_timestamp_index

│ │ │   ├── dictionary

│ │ │   ├── index_format_option

│ │ │   └── posting

│ │ ├── operation_log

│ │ │ ├── data

│ │ │ └── meta

│ │ ├── segment_file_list

│ │ ├── segment_info

│ │ ├── segment_metrics

│ │ └── summary

│ │   ├── data

│ │   └── offset

│ └── version.0

├── schema.json

├── segment_0_level_0

│ ├── attribute

│ │ ├── pack_attr

│ │ │ ├── data

│ │ │ ├── data_info

│ │ │ └── offset

│ │ ├── provcity

│ │ │ ├── data

│ │ │ ├── data_info

│ │ │ └── offset

│ │ ├── quantity

│ │ │ └── data

│ │ ├── uniq_pack_attr

│ │ │ ├── data

│ │ │ ├── data_info

│ │ │ └── offset

│ │ └── vip

│ │   ├── data

│ │   ├── data_info

│ │   └── offset

│ ├── counter

│ ├── deletionmap

│ │ └── data_0

│ ├── deploy_index

│ ├── index

│ │ └── pk

│ │   └── data

│ ├── segment_file_list

│ ├── segment_info

│ ├── segment_metrics

│ └── summary

│   ├── data

│   └── offset

├── segment_1_level_0

│ ├── attribute

│ │ ├── pack_attr

│ │ │ ├── category

│ │ │ ├── data

│ │ │ ├── data_info

│ │ │ ├── int32_multi

│ │ │ └── offset

│ │ ├── provcity

│ │ │ ├── data

│ │ │ ├── data_info

│ │ │ └── offset

│ │ ├── quantity

│ │ │ └── data

│ │ ├── uniq_pack_attr

│ │ │ ├── data

│ │ │ ├── data_info

│ │ │ └── offset

│ │ └── vip

│ │   ├── data

│ │   ├── data_info

│ │   └── offset

│ ├── counter

│ ├── deletionmap

│ ├── deploy_index

│ ├── index

│ │ └── pk

│ │   └── data

│ ├── segment_file_list

│ ├── segment_info

│ ├── segment_metrics

│ └── summary

│   ├── data

│   └── offset

├── summary_info

│ ├── index_summary.0

│ └── index_summary.1

├── version.0

└── version.1

分层架构

脑图详解