일 | 월 | 화 | 수 | 목 | 금 | 토 |
---|---|---|---|---|---|---|
 | | | 1 | 2 | 3 | 4 |
5 | 6 | 7 | 8 | 9 | 10 | 11 |
12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 | 20 | 21 | 22 | 23 | 24 | 25 |
26 | 27 | 28 | 29 | 30 | 31 |
Tags
- license delete
- high level client
- MySQL
- 파이썬
- License
- sort
- flask
- aggs
- 차트
- analyzer test
- matplotlib
- zip 암호화
- Java
- API
- docker
- aggregation
- zip 파일 암호화
- ELASTIC
- Python
- 900gle
- Elasticsearch
- token filter test
- licence delete curl
- Mac
- query
- TensorFlow
- springboot
- plugin
- Test
- Kafka
Archives
- Today
- Total
개발잡부
[es8] kNN vs ANN indexer 본문
반응형
900gle crawler를 통해 수집한 데이터를 DB에 저장한 뒤 json 파일로 내보내고, 그 json 파일을 읽어 색인을 생성한다.
https://ldh-6019.tistory.com/477
# -*- coding: utf-8 -*-
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import tensorflow_hub as hub
import tensorflow_text
import kss, numpy
##### INDEXING #####
def index_data():
    """Recreate the three indices and bulk-load every document from DATA_FILE.

    For each of the A/B/C indices this announces the index, deletes any
    existing copy (a 404 for a missing index is ignored) and creates it from
    its mapping file.  The JSON array stored in DATA_FILE is then handed to
    index_batch_a / index_batch_b / index_batch_c in batches of BATCH_SIZE
    documents, and finally all three indices are refreshed.

    Reads the module-level names ``client``, ``INDEX_NAME_*``,
    ``INDEX_FILE_*``, ``DATA_FILE`` and ``BATCH_SIZE``.
    """
    targets = (
        (INDEX_NAME_A, INDEX_FILE_A),
        (INDEX_NAME_B, INDEX_FILE_B),
        (INDEX_NAME_C, INDEX_FILE_C),
    )

    # Keep the phases separate (announce, delete, create) so every old index
    # is dropped before any new one is created.
    for index_name, _ in targets:
        print("Creating the '" + index_name + "' index.")

    for index_name, _ in targets:
        client.indices.delete(index=index_name, ignore=[404])

    for index_name, mapping_path in targets:
        # Explicit encoding: do not depend on the platform default.
        with open(mapping_path, encoding="utf-8") as index_file:
            source = index_file.read().strip()
        client.indices.create(index=index_name, body=source)

    # The data file is read as UTF-8 so non-ASCII text decodes the same way
    # on every platform.
    with open(DATA_FILE, 'r', encoding="utf-8") as data_file:
        json_object = json.load(data_file)

    count = 0
    docs = []
    for line in json_object:
        docs.append(line)
        count += 1
        if count % BATCH_SIZE == 0:
            index_batch_a(docs)
            index_batch_b(docs)
            index_batch_c(docs)
            docs = []
            print("Indexed {} documents.".format(count))

    # Flush the final, partially filled batch.
    if docs:
        index_batch_a(docs)
        index_batch_b(docs)
        index_batch_c(docs)
        print("Indexed {} documents.".format(count))

    for index_name, _ in targets:
        client.indices.refresh(index=index_name)
    print("Done indexing.")
def index_batch_a(docs):
    """Embed each document's "name" and bulk-index the batch into INDEX_NAME_A.

    Args:
        docs: list of dicts; every dict must have a "name" key.

    Each bulk action is a new dict, so the caller's documents are not
    modified. The same ``docs`` list is passed to the other batch indexers,
    and they should not see this function's ``_index`` / ``name_vector``
    entries.
    """
    if not docs:
        # Nothing to embed or send.
        return

    name_vectors = embed_text([doc["name"] for doc in docs])
    requests = [
        {
            **doc,
            "_op_type": "index",
            "_index": INDEX_NAME_A,
            "name_vector": name_vectors[i],
        }
        for i, doc in enumerate(docs)
    ]
    bulk(client, requests)
def index_batch_b(docs):
    """Embed each document's "name" and bulk-index the batch into INDEX_NAME_B.

    Args:
        docs: list of dicts; every dict must have a "name" key.

    Each bulk action is a new dict, so the caller's documents are not
    modified. The same ``docs`` list is passed to the other batch indexers,
    and they should not see this function's ``_index`` / ``name_vector``
    entries.
    """
    if not docs:
        # Nothing to embed or send.
        return

    name_vectors = embed_text([doc["name"] for doc in docs])
    requests = [
        {
            **doc,
            "_op_type": "index",
            "_index": INDEX_NAME_B,
            "name_vector": name_vectors[i],
        }
        for i, doc in enumerate(docs)
    ]
    bulk(client, requests)
def index_batch_c(docs):
    """Embed each document's "name" and bulk-index the batch into INDEX_NAME_C.

    Args:
        docs: list of dicts; every dict must have a "name" key.

    Each bulk action is a new dict, so the caller's documents are not
    modified. The same ``docs`` list is passed to the other batch indexers,
    and they should not see this function's ``_index`` / ``name_vector``
    entries.
    """
    if not docs:
        # Nothing to embed or send.
        return

    name_vectors = embed_text([doc["name"] for doc in docs])
    requests = [
        {
            **doc,
            "_op_type": "index",
            "_index": INDEX_NAME_C,
            "name_vector": name_vectors[i],
        }
        for i, doc in enumerate(docs)
    ]
    bulk(client, requests)
def paragraph_index(paragraph, dims=512):
    """Return the mean sentence embedding of the start of ``paragraph``.

    Only the first 100 characters are considered.  They are split into
    sentences with ``kss.split_sentences``, each sentence is embedded with
    ``embed_text``, and the vectors are averaged.

    Args:
        paragraph: text to embed.
        dims: length of the vectors produced by ``embed_text``.  Defaults to
            512, the ``dims`` declared for ``name_vector`` in the index
            mappings.  The accumulator must have this width or the addition
            below fails to broadcast.

    Returns:
        A flat numpy array of length ``dims``.  If no sentence is found the
        result is all zeros rather than NaN.
    """
    # Split into sentences and accumulate the embedding of each one.
    vector_sum = numpy.zeros((1, dims))
    sent_count = 0
    for sent in kss.split_sentences(paragraph[0:100]):
        # Embed the sentence and add it to the running sum.
        vector_sum += embed_text([sent])
        sent_count += 1

    # Avoid dividing by zero when the text yields no sentences.
    if sent_count:
        vector_sum /= sent_count
    return vector_sum.ravel(order='C')
##### EMBEDDING #####
def embed_text(input):
    """Run ``input`` through the module-level ``embed`` callable.

    Each item produced by ``embed(input)`` is converted with
    ``.numpy().tolist()``, and the converted items are returned as a list in
    the same order.
    """
    embedded = []
    for tensor in embed(input):
        embedded.append(tensor.numpy().tolist())
    return embedded
##### MAIN SCRIPT #####
if __name__ == '__main__':
    # Script entry point: define the configuration globals the functions
    # above read, load the embedding model, connect to Elasticsearch and run
    # the indexing job.
    import os

    INDEX_NAME_A = "ann-index"
    INDEX_FILE_A = "./data/products/ann-index.json"

    INDEX_NAME_B = "knn-index"
    INDEX_FILE_B = "./data/products/knn-index.json"

    INDEX_NAME_C = "match-index"
    # NOTE(review): match-index is created from the knn-index mapping;
    # confirm this is intentional and not a copy-paste leftover.
    INDEX_FILE_C = "./data/products/knn-index.json"

    DATA_FILE = "./db/json_data.json"
    BATCH_SIZE = 100  # documents per bulk request

    SEARCH_SIZE = 3

    print("Downloading pre-trained embeddings from tensorflow hub...")
    embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

    # Credentials can be supplied through the environment so they do not
    # have to live in the source file.
    # NOTE(review): the literal defaults are kept only so existing runs
    # behave as before; remove them once the environment variables are set.
    es_username = os.environ.get("ES_USERNAME", "elastic")
    es_password = os.environ.get("ES_PASSWORD", "dlengus")
    client = Elasticsearch(http_auth=(es_username, es_password))

    index_data()

    print("Done.")
ann-index.json
{
"settings": {
"number_of_shards": 2,
"number_of_replicas": 0
},
"mappings": {
"dynamic": "true",
"_source": {
"enabled": "true"
},
"properties": {
"name": {
"type": "text"
},
"name_vector": {
"type": "dense_vector",
"dims": 512,
"index": true,
"similarity": "l2_norm"
},
"price": {
"type": "double"
},
"id": {
"type": "keyword"
}
}
}
}
knn-index.json
{
"settings": {
"number_of_shards": 2,
"number_of_replicas": 0
},
"mappings": {
"dynamic": "true",
"_source": {
"enabled": "true"
},
"properties": {
"name": {
"type": "text"
},
"name_vector": {
"type": "dense_vector",
"dims": 512
},
"price": {
"type": "double"
},
"id": {
"type": "keyword"
}
}
}
}
반응형
'ElasticStack8 > kNN Search' 카테고리의 다른 글
[es8] k-nearest neighbor (kNN) search (0) | 2023.05.03 |
---|---|
[es8] kNN vs ANN 성능비교 (0) | 2022.09.04 |
[es8] similarity Search (0) | 2022.09.03 |
Comments