반응형
Recent Posts
Recent Comments
관리 메뉴

개발잡부

[es8] kNN vs ANN indexer 본문

ElasticStack8/kNN Search

[es8] kNN vs ANN indexer

닉의네임 2023. 5. 1. 02:26
반응형

900gle crawler를 통해 수집한 데이터를 DB에 저장한 뒤 json 파일로 내보내고, 그 json 파일을 읽어 색인을 생성한다.

 

https://ldh-6019.tistory.com/477

 

[python] DB data to json file

https://ldh-6019.tistory.com/476 [python] mysql 연동 - PyMySQL Python의 MySql모듈 설치 Python의 MySql모듈 PyMySQL mysql-connector-python ( >= MySQL 8.0) or mysql.connector (< MySQL 8.0) PyMySQL를 이용하여 접근하는 방법 $ pip install Py

ldh-6019.tistory.com

 

# -*- coding: utf-8 -*-

import json

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

import tensorflow_hub as hub
import tensorflow_text
import kss, numpy


##### INDEXING #####
def index_data():
    """Recreate the three indices and bulk-index every document in DATA_FILE.

    Each of the A/B/C indices is deleted (a 404 for a missing index is
    ignored) and recreated from its mapping file. The JSON array stored in
    DATA_FILE is then sent to all three indices in batches of BATCH_SIZE,
    and the indices are refreshed once everything has been sent.

    Relies on the module-level ``client`` and on the INDEX_NAME_*,
    INDEX_FILE_*, DATA_FILE and BATCH_SIZE settings set by the main script.
    """
    indices = (
        (INDEX_NAME_A, INDEX_FILE_A),
        (INDEX_NAME_B, INDEX_FILE_B),
        (INDEX_NAME_C, INDEX_FILE_C),
    )

    # Same order of effects as before: announce all, delete all, create all.
    for index_name, _ in indices:
        print("Creating the '" + index_name + "' index.")
    for index_name, _ in indices:
        client.indices.delete(index=index_name, ignore=[404])

    for index_name, mapping_file in indices:
        # Explicit UTF-8 so reading does not depend on the platform locale.
        with open(mapping_file, encoding="utf-8") as index_file:
            source = index_file.read().strip()
        client.indices.create(index=index_name, body=source)

    # The data file holds non-ASCII (Korean) text; decode it as UTF-8
    # instead of whatever the default locale encoding happens to be.
    with open(DATA_FILE, 'r', encoding="utf-8") as data_file:
        json_object = json.load(data_file)

    # Walk the document list in BATCH_SIZE slices; the last slice may be
    # shorter and is still indexed.
    for start in range(0, len(json_object), BATCH_SIZE):
        docs = json_object[start:start + BATCH_SIZE]
        index_batch_a(docs)
        index_batch_b(docs)
        index_batch_c(docs)
        print("Indexed {} documents.".format(start + len(docs)))

    for index_name, _ in indices:
        client.indices.refresh(index=index_name)
    print("Done indexing.")

def index_batch_a(docs):
    """Embed each document's ``name`` and bulk-index the batch into INDEX_NAME_A.

    Args:
        docs: sequence of dicts, each providing a ``name`` key. The dicts are
            not modified; a new action dict is built for every document.
    """
    names = [doc["name"] for doc in docs]
    name_vectors = embed_text(names)
    # Copy each document into a fresh bulk action instead of aliasing it, so
    # the bulk metadata and the vector are not written into the caller's data.
    requests = [
        {
            **doc,
            "_op_type": "index",
            "_index": INDEX_NAME_A,
            "name_vector": vector,
        }
        for doc, vector in zip(docs, name_vectors)
    ]
    bulk(client, requests)

def index_batch_b(docs):
    """Embed each document's ``name`` and bulk-index the batch into INDEX_NAME_B.

    Args:
        docs: sequence of dicts, each providing a ``name`` key. The dicts are
            not modified; a new action dict is built for every document.
    """
    names = [doc["name"] for doc in docs]
    name_vectors = embed_text(names)
    # Copy each document into a fresh bulk action instead of aliasing it, so
    # the bulk metadata and the vector are not written into the caller's data.
    requests = [
        {
            **doc,
            "_op_type": "index",
            "_index": INDEX_NAME_B,
            "name_vector": vector,
        }
        for doc, vector in zip(docs, name_vectors)
    ]
    bulk(client, requests)

def index_batch_c(docs):
    """Embed each document's ``name`` and bulk-index the batch into INDEX_NAME_C.

    Args:
        docs: sequence of dicts, each providing a ``name`` key. The dicts are
            not modified; a new action dict is built for every document.
    """
    names = [doc["name"] for doc in docs]
    name_vectors = embed_text(names)
    # Copy each document into a fresh bulk action instead of aliasing it, so
    # the bulk metadata and the vector are not written into the caller's data.
    requests = [
        {
            **doc,
            "_op_type": "index",
            "_index": INDEX_NAME_C,
            "name_vector": vector,
        }
        for doc, vector in zip(docs, name_vectors)
    ]
    bulk(client, requests)

def paragraph_index(paragraph, dims=512):
    """Return the mean sentence embedding of the first 100 characters of *paragraph*.

    The text is truncated to 100 characters, split into sentences with
    ``kss.split_sentences``, each sentence is embedded with ``embed_text``,
    and the sentence vectors are averaged.

    Args:
        paragraph: text to embed.
        dims: length of the zero vector returned when no sentence is found.
            Defaults to 512, the ``dims`` declared for ``name_vector`` in the
            index mappings.

    Returns:
        A 1-D numpy array holding the averaged embedding.
    """
    # Split into sentences (only the first 100 characters are considered).
    sentences = kss.split_sentences(paragraph[0:100])
    if not sentences:
        # Nothing to average: return zeros rather than dividing by zero.
        return numpy.zeros(dims)

    # Embed the sentences and average them. The vector width comes from the
    # embeddings themselves, so it cannot disagree with the model output.
    sentence_vectors = numpy.asarray(embed_text(sentences), dtype=float)
    return sentence_vectors.mean(axis=0)

##### EMBEDDING #####
def embed_text(input):
    """Run the module-level ``embed`` model on *input* and return plain lists.

    Every item produced by iterating the model output is converted with
    ``.numpy().tolist()``, so the result is a list with one entry per item.
    """
    embedded = []
    for tensor in embed(input):
        embedded.append(tensor.numpy().tolist())
    return embedded

##### MAIN SCRIPT #####
if __name__ == '__main__':
    import os

    # Index names and the mapping files used to create them.
    INDEX_NAME_A = "ann-index"
    INDEX_FILE_A = "./data/products/ann-index.json"

    INDEX_NAME_B = "knn-index"
    INDEX_FILE_B = "./data/products/knn-index.json"

    # NOTE(review): match-index is created from the same mapping file as
    # knn-index -- confirm this is intentional.
    INDEX_NAME_C = "match-index"
    INDEX_FILE_C = "./data/products/knn-index.json"

    DATA_FILE = "./db/json_data.json"
    BATCH_SIZE = 100
    # NOTE(review): SEARCH_SIZE is not referenced anywhere in the code shown
    # here; kept in case other code relies on it.
    SEARCH_SIZE = 3

    print("Downloading pre-trained embeddings from tensorflow hub...")
    embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

    # Credentials can be supplied through ES_USER / ES_PASSWORD so they do not
    # have to live in source control; the previous values remain the defaults
    # so existing runs behave the same.
    client = Elasticsearch(
        http_auth=(
            os.environ.get("ES_USER", "elastic"),
            os.environ.get("ES_PASSWORD", "dlengus"),
        )
    )
    index_data()

    print("Done.")

 

ann-index.json

{
  "settings": {
    "number_of_shards": 2,
    "number_of_replicas": 0
  },
   "mappings": {
    "dynamic": "true",
    "_source": {
      "enabled": "true"
    },
    "properties": {

      "name": {
        "type": "text"
      },
      "name_vector": {
        "type": "dense_vector",
        "dims": 512,
        "index": true,
        "similarity": "l2_norm"
      },
      "price": {
        "type": "double"
      },
      "id": {
        "type": "keyword"
      }
    }
  }
}

knn-index.json

{
  "settings": {
    "number_of_shards": 2,
    "number_of_replicas": 0
  },
   "mappings": {
    "dynamic": "true",
    "_source": {
      "enabled": "true"
    },
    "properties": {

      "name": {
        "type": "text"
      },
      "name_vector": {
        "type": "dense_vector",
        "dims": 512
      },
      "price": {
        "type": "double"
      },
      "id": {
        "type": "keyword"
      }
    }
  }
}

 

반응형

'ElasticStack8 > kNN Search' 카테고리의 다른 글

[es8] k-nearest neighbor (kNN) search  (0) 2023.05.03
[es8] kNN vs ANN 성능비교  (0) 2022.09.04
[es8] similarity Search  (0) 2022.09.03
Comments