ES 7.0.0的简单使用操作
本文是基于ES 7.0.0的简单使用操作。
大部分是一些RESTful API的使用示例,内容比较基础且杂乱,仅作为个人记录。
从ES 7.0.0版本开始,类型(type)的概念已被弃用(将在8.0中彻底移除):API中不再需要指定类型,每个index下只有一个默认类型_doc
创建索引示例:
创建一个名为laws的索引,指定默认分词器为ik_max_word,并过滤掉文档中的html标签,同时定义文档结构mapping:
PUT /laws { "settings": { "number_of_shards" : 1, "number_of_replicas" : 0, "analysis.analyzer.default.type":"ik_max_word", "analysis.char_filter":["html_strip"] }, "mappings": { "properties": { "title":{ "type": "text" }, "DocNo":{ "type": "text" }, "unit":{ "type": "text" }, "content":{ "type": "text" } } } }
为每个字段设置分词器analyzer以及搜索分词器search_analyzer:
PUT /laws20190718 { "settings": { "number_of_shards" : 1, "number_of_replicas" : 0, "analysis.char_filter":["html_strip"] }, "mappings": { "properties": { "id":{ "type":"keyword" }, "title":{ "type": "text", "analyzer": "ik_max_word", "search_analyzer": "ik_smart" }, "strs":{ "type": "text", "analyzer": "ik_max_word", "search_analyzer": "ik_smart" }, "category":{ "type": "text", "analyzer": "ik_max_word", "search_analyzer": "ik_smart" }, "date":{ "type":"date" } } } }
写入单条数据:文档内容中含有英文双引号时,需要使用三引号"""包围在文档内容两侧
PUT laws/_doc/1 { "tid" : "1", "content" : """<p align="center">2019</p>""" }
批量写入文档数据:文档内容中含有英文双引号时,需要使用三引号"""包围在文档内容两侧
PUT /laws/_bulk
{"index":{"_id" : 2}}
{"tid" : "1","content" : """<p align="center">2019</p>"""}
{"index":{"_id" : 3}}
{"tid" : "1","content" : """<p align="center">2020</p>"""}
{"index":{"_id" : 4}}
{"tid" : "1","content" : """<p align="center">2021</p>"""}
简单搜索match
参数:from 从指定的偏移量中提取搜索结果,默认为 0
参数:size 返回搜索结果条数,默认为 10
GET /laws/_search?from=0&size=10 { "query": { "match": { "content": "采购" } } }
ik分词模式
- ik_smart 最粗粒度的拆分
- ik_max_word 将文本做最细粒度的拆分,会穷尽各种可能的词语组合
结巴分词模式
- jieba_search 倾向于完整、顺序的切分,类似于ik_smart
- jieba_index 倾向于分出更多可能的词,类似于ik_max_word
测试分词引擎
GET _analyze { "text" : "我爱中华人民共和国", "analyzer": "ik_smart" } ik_smart返回结果 { "tokens" : [ { "token" : "我", "start_offset" : 0, "end_offset" : 1, "type" : "CN_CHAR", "position" : 0 }, { "token" : "爱", "start_offset" : 1, "end_offset" : 2, "type" : "CN_CHAR", "position" : 1 }, { "token" : "中华人民共和国", "start_offset" : 2, "end_offset" : 9, "type" : "CN_WORD", "position" : 2 } ] } GET _analyze { "text" : "我爱中华人民共和国", "analyzer": "ik_max_word" } ik_max_word返回结果 { "tokens" : [ { "token" : "我", "start_offset" : 0, "end_offset" : 1, "type" : "CN_CHAR", "position" : 0 }, { "token" : "爱", "start_offset" : 1, "end_offset" : 2, "type" : "CN_CHAR", "position" : 1 }, { "token" : "中华人民共和国", "start_offset" : 2, "end_offset" : 9, "type" : "CN_WORD", "position" : 2 }, { "token" : "中华人民", "start_offset" : 2, "end_offset" : 6, "type" : "CN_WORD", "position" : 3 }, { "token" : "中华", "start_offset" : 2, "end_offset" : 4, "type" : "CN_WORD", "position" : 4 }, { "token" : "华人", "start_offset" : 3, "end_offset" : 5, "type" : "CN_WORD", "position" : 5 }, { "token" : "人民共和国", "start_offset" : 4, "end_offset" : 9, "type" : "CN_WORD", "position" : 6 }, { "token" : "人民", "start_offset" : 4, "end_offset" : 6, "type" : "CN_WORD", "position" : 7 }, { "token" : "共和国", "start_offset" : 6, "end_offset" : 9, "type" : "CN_WORD", "position" : 8 }, { "token" : "共和", "start_offset" : 6, "end_offset" : 8, "type" : "CN_WORD", "position" : 9 }, { "token" : "国", "start_offset" : 8, "end_offset" : 9, "type" : "CN_CHAR", "position" : 10 } ] } GET _analyze { "text" : "我爱中华人民共和国", "analyzer": "jieba_index" } jieba_index返回结果 { "tokens" : [ { "token" : "我爱", "start_offset" : 0, "end_offset" : 2, "type" : "word", "position" : 0 }, { "token" : "中华", "start_offset" : 2, "end_offset" : 4, "type" : "word", "position" : 1 }, { "token" : "中华人民共和国", "start_offset" : 2, "end_offset" : 9, "type" : "word", "position" : 1 }, { "token" : "华人", "start_offset" : 3, "end_offset" : 5, "type" : "word", "position" : 1 }, { "token" : 
"人民", "start_offset" : 4, "end_offset" : 6, "type" : "word", "position" : 2 }, { "token" : "共和", "start_offset" : 6, "end_offset" : 8, "type" : "word", "position" : 3 }, { "token" : "共和国", "start_offset" : 6, "end_offset" : 9, "type" : "word", "position" : 3 } ] } GET _analyze { "text" : "我爱中华人民共和国", "analyzer": "jieba_search" } jieba_search返回结果 { "tokens" : [ { "token" : "我爱", "start_offset" : 0, "end_offset" : 2, "type" : "word", "position" : 0 }, { "token" : "中华人民共和国", "start_offset" : 2, "end_offset" : 9, "type" : "word", "position" : 1 } ] }
多字段检索multi_match
GET /lawss/_search { "query": { "multi_match": { "query": "spark", "fields": ["title","strs"] } } }
结果关键词高亮
GET /lawss/_search { "query": { "multi_match": { "query": "数据仓库", "fields": ["title","strs"] } }, "highlight": { "pre_tags": ["<b>"], "post_tags": ["</b>"], "fields": { "title": {}, "strs": {} } } }
简单SQL查询
POST /_sql { "query": "SELECT title,category FROM lawss WHERE date > '2018-01-01'" }
# 加入format=txt参数可以将json结果转为表格形式
# 支持的返回格式:csv、json(默认)、tsv、txt、yaml、cbor(二进制)、smile(二进制)
# 设置“fetch_size”数值可控制返回记录数(可在SQL语句中添加LIMIT控制返回记录数)
# 默认每次请求提取1000条记录
POST /_sql?format=txt { "query": "SELECT title,category FROM lawss WHERE date > '2018-01-01'", "fetch_size" : 5 }
索引别名:借助别名,重建索引时可不影响现有业务正常运行(详见官方文档)。
# 添加别名
POST /_aliases { "actions": [ { "add": { "index": "lawss", "alias": "laws" } } ] }
# 删除别名
POST /_aliases { "actions": [ { "remove": { "index": "lawss", "alias": "laws" } } ] }
重建索引reindex
需要预先创建好新索引的设置以及映射等(详见官方文档)。
# 将索引"lawss"重建至索引"laws20190718"
POST _reindex { "source": { "index": "lawss" }, "dest": { "index": "laws20190718" } }
自定义检索返回字段
如下所示,只返回ID、NAME、TYPE这三个字段的数据:
GET /audit_law/_search { "query": { "multi_match": { "query": "项目", "fields": [] } }, "_source": ["ID","NAME","TYPE"] }
多条件组合查询
GET /audit_law/_search { "query": { "bool": { "must": [ {"multi_match": {"query": "专项资金","fields": []}} ], "filter": [ {"term": {"RELEASE_ORGAN": {"value": "财政"}}}, {"term": {"REGION_NAME": {"value": "全国"}}}, {"term": {"NAME": {"value": "财政"}}}, {"term": {"DOCUMENT_NO": {"value": "2018年12月28日"}}} , {"term": {"INDUSTRY_NAME": "教育"}}, {"terms": {"LAW_TYPE_CODE": ["10805","10303"]}} ] } }, "sort": [ { "PUBLISH_TIME": { "order": "desc" } } ], "_source": ["ID","NAME","TYPE","INDUSTRY_NAME", "RELEASE_ORGAN","DOCUMENT_NO","REGION_NAME"] }