Building a search engine over your own local data with jieba and Whoosh
Example
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from jieba.analyse import ChineseAnalyzer
from whoosh.qparser import QueryParser
import os

# Use jieba's ChineseAnalyzer so Whoosh tokenizes Chinese text correctly
analyzer = ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True, analyzer=analyzer), content=TEXT(stored=True, analyzer=analyzer), id=ID(stored=True))
if not os.path.exists("index"):
    os.mkdir("index")
ix = create_in("index", schema)
documents = [
    {
        "title": "下文",
        "content": "首先安装jieba和whoosh库,",
        "id": "1"
    },
    {
        "title": "中文自然语言处理",
        "content": "中文自然语言处理涉及分词、词性标注、命名实体识别等...",
        "id": "2"
    }
]
writer = ix.writer()
for doc in documents:
    writer.add_document(title=doc["title"], content=doc["content"], id=doc["id"])
writer.commit()

searcher = ix.searcher()
query_parser = QueryParser("content", schema=ix.schema)
search_input = "jieba和whoosh"
query = query_parser.parse(search_input)
results = searcher.search(query, limit=None)
print(f"Found {len(results)} matching documents:")
for result in results:
    print(f"{result['id']} - {result['title']}")
Putting it into practice
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from jieba.analyse import ChineseAnalyzer
from whoosh.qparser import QueryParser
import os
import jieba
import pandas as pd
from glob import glob
from multiprocessing import Process, freeze_support
from tqdm import tqdm
class GenVocTensorForDataSet:
    def __init__(self):
        pass

    @staticmethod
    def gen_data_tensor(data_v, out_dir, process_count):
        """
        Split each text file into segments and pickle the result.
        :param data_v: list of txt file paths handled by this worker
        :param out_dir: directory the pickled segment lists are written to
        :param process_count: index of this worker, used in the output file name
        :return: None
        """
        total_l = []
        one_p_count = 0
        for one_v in tqdm(data_v):
            one_p_count += 1
            with open(one_v, "r", encoding="utf-8") as f:
                total_str = f.read()
            # Drop all whitespace, then segment the text with jieba
            total_str = "".join(total_str.split())
            one_data = list(jieba.cut(total_str))
            documents = []
            text = ""
            # Accumulate consecutive tokens until the accumulated piece no longer
            # re-appears in the remainder of the document, then emit it as a segment
            for one in one_data:
                text += one
                if text not in total_str[len("".join(documents)) + len(text):]:
                    documents.append(text)
                    text = ""
            total_l.append(documents)
        # One pickle per worker, holding the segment lists of all its files
        pd.to_pickle({"voc": total_l},
                     out_dir + "/{}{}.pandas_pickle_data_set".format(process_count, one_p_count))
    def gen_voc_data_to_tensor_set(self, paths_list_dir, out_dir, works_num=8):
        """
        Unique-length splitting, parallelised over several worker processes.
        :param paths_list_dir: directory containing the txt files
        :param out_dir: directory the pickled segment lists are written to
        :param works_num: number of worker processes
        :return: None
        """
        paths_list_pr = glob(pathname=paths_list_dir + "*")
        p_list = []
        # Hand a slice of the file list to each worker process
        # (max(1, ...) guards against a zero step when there are fewer files than workers)
        step = max(1, len(paths_list_pr) // works_num)
        for i in range(0, len(paths_list_pr), step):
            j = step + i
            p = Process(target=self.gen_data_tensor, args=(
                paths_list_pr[i:j], out_dir, i))
            p.start()
            p_list.append(p)
        for p in p_list:
            p.join()
    @staticmethod
    def init_data_set(paths_list_dir):
        paths_list_pr = glob(pathname=paths_list_dir + "*")
        analyzer = ChineseAnalyzer()
        schema = Schema(title=TEXT(stored=True, analyzer=analyzer), content=TEXT(stored=True, analyzer=analyzer),
                        id=ID(stored=True))
        if not os.path.exists("index"):
            os.mkdir("index")
        # create_in returns an Index object directly; it is not a context manager
        ix = create_in("index", schema, indexname='article_index')
        writer = ix.writer()
        total_count_id = 0
        for one_p in paths_list_pr:
            documents = pd.read_pickle(one_p)
            for doc in tqdm(documents["voc"]):
                # Pair each segment (content) with the segment that follows it (title),
                # so a hit on a segment surfaces what came next in the source text
                for doc_i, doc_j in zip(doc[1:], doc[:-1]):
                    writer.add_document(title=doc_i, content=doc_j, id=str(total_count_id))
                    total_count_id += 1
        writer.commit()
    @staticmethod
    def add_data_set(paths_list_dir):
        paths_list_pr = glob(pathname=paths_list_dir + "*")
        # Open the existing index (created by init_data_set) to append more documents
        ix = open_dir("index", indexname='article_index')
        writer = ix.writer()
        # Note: ids restart at 0 here, so they can repeat ids written by init_data_set
        total_count_id = 0
        for one_p in paths_list_pr:
            documents = pd.read_pickle(one_p)
            for doc in tqdm(documents["voc"]):
                for doc_i, doc_j in zip(doc[1:], doc[:-1]):
                    writer.add_document(title=doc_i, content=doc_j, id=str(total_count_id))
                    total_count_id += 1
        writer.commit()
    @staticmethod
    def search_by_jieba_world(search_text):
        ix = open_dir("index", indexname='article_index')
        with ix.searcher() as searcher:
            query_parser = QueryParser("content", schema=ix.schema)
            query = query_parser.parse(search_text)
            results = searcher.search(query, limit=None)
            print(f"Found {len(results)} matching documents:")
            # Materialise the hits before the searcher closes; the Results object
            # is no longer readable once the with-block exits
            hits = [{"id": r["id"], "title": r["title"]} for r in results]
            for hit in hits:
                print(f"{hit['id']} - {hit['title']}")
        return hits
if __name__ == '__main__':
    freeze_support()
    txt_p = "E:/just_and_sum/data_sets/"
    gvt_fds = GenVocTensorForDataSet()
    # Step 1: segment the raw txt files into pickled segment lists
    # gvt_fds.gen_voc_data_to_tensor_set(txt_p, "E:/just_and_sum/data_set_d", works_num=8)
    # Step 2: build the Whoosh index from the pickles
    # data_base = gvt_fds.init_data_set("E:/just_and_sum/data_set_d/")
    # Step 3: search
    search_res = gvt_fds.search_by_jieba_world("头孢克洛头孢泊肟酯是同")
    print(search_res)
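If you also want to see the matching context rather than just the id and title, Whoosh hits can render highlighted snippets for stored fields. A minimal sketch against the index built above (the query string is only an example):

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir("index", indexname='article_index')
with ix.searcher() as searcher:
    query = QueryParser("content", schema=ix.schema).parse("头孢克洛")
    for hit in searcher.search(query, limit=5):
        # highlights() works here because the content field was declared stored=True
        print(hit["id"], hit["title"], hit.highlights("content"))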