
[tensorflow 2] universal-sentence-encoder-multilingual

닉의네임 2022. 1. 11. 10:54

Korean is among the supported languages.

 

I tested this model:

https://tfhub.dev/google/universal-sentence-encoder-multilingual/3
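A quick way to confirm the multilingual claim is to embed a Korean and an English sentence and compare them. This snippet is my own sketch, not from the TF Hub page, and the sentences are arbitrary examples:

import numpy as np
import tensorflow_hub as hub
import tensorflow_text  # noqa: F401 -- registers the ops the encoder needs

model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

# Semantically similar sentences in different languages should land close
# together in the shared 512-dimensional embedding space.
ko = model(["오늘 날씨가 좋다."]).numpy()[0]
en = model(["The weather is nice today."]).numpy()[0]

cos = np.dot(ko, en) / (np.linalg.norm(ko) * np.linalg.norm(en))
print(cos)  # expect a clearly higher value than for unrelated sentences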

 

# List existing conda environments
conda info --envs

# Create and activate a virtual environment for this test
conda create --name text python=3.7
conda activate text

require.txt

elasticsearch
numpy
tensorflow
tensorflow-hub
tensorflow_text
kss
regex
Install the dependencies:

pip install -r require.txt
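One gotcha: the scripts below import tensorflow_text without ever calling it, because the import registers the SentencePiece ops the multilingual encoder needs. Also, tensorflow-text wheels are version-locked to TensorFlow, so if hub.load fails with a missing-op error, check that the two versions match. A quick environment check (my own sketch, not part of the original post):

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text  # noqa: F401 -- needed only for its op registration
import kss
from elasticsearch import Elasticsearch

print("tensorflow", tf.__version__)
print("tensorflow_text", tensorflow_text.__version__)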

 

put_data.py

# -*- coding: utf-8 -*-

import json

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

import tensorflow_hub as hub
import tensorflow_text
import kss, numpy


##### INDEXING #####

def index_data():
    print("Creating the 'korquad' index.")
    client.indices.delete(index=INDEX_NAME, ignore=[404])

    with open(INDEX_FILE) as index_file:
        source = index_file.read().strip()
        client.indices.create(index=INDEX_NAME, body=source)

    count = 0

    with open(DATA_FILE) as data_file:
        for line in data_file:
            line = line.strip()

            json_data = json.loads(line)

            docs = []
            for j in json_data:
                count += 1

                docs.append(j)
                if count % BATCH_SIZE == 0:
                    index_batch(docs)
                    docs = []
                    print("Indexed {} documents.".format(count))

            if docs:
                index_batch(docs)
                print("Indexed {} documents.".format(count))

    client.indices.refresh(index=INDEX_NAME)
    print("Done indexing.")


def paragraph_index(paragraph):
    # Split the first 100 characters of the paragraph into sentences,
    # embed each sentence, and average the sentence vectors.
    avg_paragraph_vec = numpy.zeros((1, 512))
    sent_count = 0

    for sent in kss.split_sentences(paragraph[0:100]):
        # Embed the sentence and accumulate it into the running sum
        avg_paragraph_vec += embed_text([sent])
        sent_count += 1
    avg_paragraph_vec /= max(sent_count, 1)
    # Flatten to a plain Python list so the bulk request is JSON-serializable
    return avg_paragraph_vec.ravel(order='C').tolist()


def index_batch(docs):
    titles = [doc["title"] for doc in docs]
    title_vectors = embed_text(titles)
    paragraph_vectors = [paragraph_index(doc["paragraph"]) for doc in docs]
    requests = []
    for i, doc in enumerate(docs):
        request = doc
        request["_op_type"] = "index"
        request["_index"] = INDEX_NAME
        request["title_vector"] = title_vectors[i]
        request["paragraph_vector"] = paragraph_vectors[i]
        requests.append(request)
    bulk(client, requests)


##### EMBEDDING #####

def embed_text(texts):
    # Embed a batch of strings and convert each vector to a plain list
    vectors = model(texts)
    return [vector.numpy().tolist() for vector in vectors]


##### MAIN SCRIPT #####

if __name__ == '__main__':
    INDEX_NAME = "korquad"
    INDEX_FILE = "./index.json"

    DATA_FILE = "./KorQuAD_v1.0_train_convert.json"
    BATCH_SIZE = 100

    SEARCH_SIZE = 3

    print("Downloading pre-trained embeddings from tensorflow hub...")
    module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
    print("module %s loaded" % module_url)
    model = hub.load(module_url)

    # client = Elasticsearch()
    client = Elasticsearch(http_auth=('elastic', 'datalake'))

    index_data()

    print("Done.")

 

 

 

search.py

# -*- coding: utf-8 -*-

import time

from elasticsearch import Elasticsearch

import tensorflow_hub as hub
import tensorflow_text

##### SEARCHING #####

def run_query_loop():
    while True:
        try:
            handle_query()
        except KeyboardInterrupt:
            return

def handle_query():
    query = input("Enter query: ")

    embedding_start = time.time()
    query_vector = embed_text([query])[0]
    embedding_time = time.time() - embedding_start

    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, doc['paragraph_vector']) + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }

    search_start = time.time()
    response = client.search(
        index=INDEX_NAME,
        body={
            "size": SEARCH_SIZE,
            "query": script_query,
            "_source": {"includes": ["title", "paragraph"]}
        }
    )
    search_time = time.time() - search_start

    print()
    print("{} total hits.".format(response["hits"]["total"]["value"]))
    print("embedding time: {:.2f} ms".format(embedding_time * 1000))
    print("search time: {:.2f} ms".format(search_time * 1000))
    for hit in response["hits"]["hits"]:
        print("id: {}, score: {}".format(hit["_id"], hit["_score"]))
        print(hit["_source"])
        print()

##### EMBEDDING #####

def embed_text(texts):
    # Embed a batch of strings and convert each vector to a plain list
    vectors = model(texts)
    return [vector.numpy().tolist() for vector in vectors]

##### MAIN SCRIPT #####

if __name__ == '__main__':
    INDEX_NAME = "korquad"

    SEARCH_SIZE = 3

    print("Downloading pre-trained embeddings from tensorflow hub...")
    module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
    print("module %s loaded" % module_url)
    model = hub.load(module_url)

    client = Elasticsearch(http_auth=('elastic', 'datalake'))

    run_query_loop()

    print("Done.")