1、给Discuz帖子、文章标题创建索引
2、输入一个关键词,返回与这个关键词相关的帖子和文章
# coding=utf-8
import os,json,time
import sys

from whoosh.index import create_in
from whoosh.fields import *
from jieba.analyse import ChineseAnalyzer
from whoosh.qparser import QueryParser
from whoosh import qparser, scoring
from whoosh import index
import MySQLdb as mdb
# Python 2 only: reload(sys) is called first so that sys.setdefaultencoding
# can be used to make UTF-8 the default codec for implicit str/unicode
# conversions.
reload(sys)
sys.setdefaultencoding('utf8')

# Wall-clock start, used for the elapsed-time report at the end of the script.
start = time.time()

# Shared connection to the local Discuz database.
con = mdb.connect(
    host='127.0.0.1',
    user='root',
    passwd='',
    db='wddis',
    charset='utf8',
    unix_socket='/tmp/mysql.sock',
)
'''
表信息:
pre_forum_thread.subject&tid --> 存放帖子标题
pre_portal_article_title.title&aid--> 存放文章标题
'''
def new_index_sql():
    """Add every row of ``pre_forum_thread`` to the module-level index ``ix``.

    Reads ``tid``, ``fid`` and ``subject`` through the module-level
    connection ``con`` and adds one document per row (fields ``title``,
    ``tid``, ``fid``), rewriting a progress percentage on one console line.
    All documents are committed in a single batch once every row has been
    added.

    Raises:
        Exception: any error raised while reading rows or adding documents
            is re-raised after the pending writer has been cancelled, so a
            failed run does not leave the index write lock held.
    """
    # Whoosh text fields must receive unicode values.
    # NOTE(review): assumes the connection returns `subject` as unicode
    # (it is opened with charset='utf8') -- confirm.
    writer = ix.writer(limitmb=256, procs=4)
    cur = con.cursor()
    try:
        # `with con:` is kept from the original code.
        # NOTE(review): its exact effect depends on the MySQLdb version -- confirm.
        with con:
            cur.execute("select tid,fid,subject from pre_forum_thread")
            # Use the row count of this very result set as the progress
            # denominator, so the percentage always matches the rows being
            # indexed and an empty table simply skips the loop.
            total = int(cur.rowcount)
            for done, (tid, fid, title) in enumerate(cur.fetchall(), 1):
                writer.add_document(title=title, tid=tid, fid=fid)
                percent = float(done) * 100 / float(total)
                sys.stdout.write("-----------> 完成百分比:%.2f" % percent)
                # "\r" returns to column 0 so the next update overwrites this one.
                sys.stdout.write("%\r")
                sys.stdout.flush()
    except Exception:
        # Drop the uncommitted documents and release the writer.
        writer.cancel()
        raise
    else:
        writer.commit()
    finally:
        cur.close()
    sys.stdout.flush()
def search_index(words):
with ix.searcher() as s:
# group=qparser.OrGroup 表示可匹配任意查询词,而不是所有查询词都匹配才能出结果
qp = QueryParser('title',schema=ix.schema,group=qparser.OrGroup)
# 下面两行表示可以使用通配符搜索,如”窗前*月光“
qp.remove_plugin_class(qparser.WildcardPlugin)
qp.add_plugin(qparser.PrefixPlugin())
for word in words:
q = qp.parse(u'%s' % word)
results = s.search(q,limit=20)
for i in results:
#print word + "----->" + i["title"],i.highlights("title"),i.score
print word,i['title'],i['tid'],i['fid']
# Tokenize Chinese text with jieba's analyzer for Whoosh.
analyzer = ChineseAnalyzer()

# Index schema. Every field is declared with stored=True; search_index reads
# title, tid and fid back from each hit.
schema = Schema(
    title=TEXT(analyzer=analyzer, stored=True),
    tid=NUMERIC(stored=True),
    fid=NUMERIC(stored=True),
)
# 存储schema信息至'798wd_luntan'目录下
indexdir = '798wd_luntan/'
if not os.path.exists(indexdir):
os.mkdir(indexdir)
try:
ix=index.open_dir(indexdir)
print '>>>>>>>> 已创建索引 <<<<<<<<<<'
except:
print '>>>>>>>> 未创建索引 <<<<<<<<<<'
ix = create_in(indexdir, schema)
new_index_sql()
words = ["容易下款的高炮口子"]
search_index(words)
end = time.time()
print "完成时间: %f s" % (end - start)

转载请注明:思享SEO博客 » whoosh为discuz帖子和文章标题创建索引