More prebuilt indexes features #235

Merged: 10 commits, Sep 29, 2020
2 changes: 2 additions & 0 deletions README.md
@@ -128,6 +128,8 @@ It currently supports:
+ robust04 (TREC Disks 4 & 5)
+ ms-marco-passage (MS MARCO Passage)
+ ms-marco-doc (MS MARCO Doc)
+ enwiki-paragraphs (English Wikipedia)
+ zhwiki-paragraphs (Chinese Wikipedia)

## How Do I Fetch a Document?

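As a companion to the updated README list, the available names can also be inspected programmatically with the `list_prebuilt_indexes` helper added later in this diff; a minimal sketch, assuming pyserini is installed:

```python
from pyserini.search import SimpleSearcher

# Prints one column per prebuilt index: name, description, download URL,
# MD5, compressed/uncompressed sizes, and whether it is already cached locally.
SimpleSearcher.list_prebuilt_indexes()
```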
23 changes: 23 additions & 0 deletions pyserini/index/_base.py
@@ -27,6 +27,7 @@
from ..analysis import get_lucene_analyzer, JAnalyzer, JAnalyzerUtils
from ..pyclass import autoclass, JString
from ..search import Document
from pyserini.util import download_prebuilt_index, get_indexes_info

logger = logging.getLogger(__name__)

@@ -150,6 +151,28 @@ def __init__(self, index_dir):
self.object = JIndexReader()
self.reader = self.object.getReader(JString(index_dir))

@classmethod
def from_prebuilt_index(cls, prebuilt_index_name: str):
"""Build an index reader from the prebuilt index, download the index if necessary.

Parameters
----------
prebuilt_index_name : str
Prebuilt index name.

Returns
-------
IndexReader
Index reader built from the prebuilt index.
"""
index_dir = download_prebuilt_index(prebuilt_index_name)
return cls(index_dir)

@staticmethod
def list_prebuilt_indexes():
"""Display available prebuilt indexes' information."""
get_indexes_info()

def analyze(self, text: str, analyzer=None) -> List[str]:
"""Analyze a piece of text. Applies Anserini's default Lucene analyzer if analyzer not specified.

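A hedged usage sketch for the new `IndexReader` classmethod (the index name and sample text are illustrative, and `IndexReader` is assumed to be re-exported from `pyserini.index`):

```python
from pyserini.index import IndexReader

# Downloads the prebuilt index into the local cache on first use, then opens it.
reader = IndexReader.from_prebuilt_index('robust04')

# From here the reader behaves like one built from a local index directory.
print(reader.analyze('prebuilt indexes save a lot of setup time'))
```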
74 changes: 74 additions & 0 deletions pyserini/indexInfo.py
@@ -0,0 +1,74 @@
INDEX_INFO = {
"robust04": {
"name": "robust04",
"description": "TREC Disks 4 & 5 (TREC 2004 Robust Track)",
"url": {"uwaterloo": "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz"},
"md5": "15f3d001489c97849a010b0a4734d018",
"downloaded": False,
"size compressed": "1821814915 bytes",
"size uncompressed": "2172142080 bytes",
"total_terms": 174540872,
"documents": 528030,
"non_empty_documents": 528030,
"unique_terms": 923436},
"trec45": {
"name": "trec45",
"description": "TREC Disks 4 & 5 (TREC 2004 Robust Track)",
"url": {"uwaterloo": "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz"},
"md5": "15f3d001489c97849a010b0a4734d018",
"downloaded": False,
"size compressed": "1821814915 bytes",
"size uncompressed": "2172142080 bytes",
"total_terms": 174540872,
"documents": 528030,
"non_empty_documents": 528030,
"unique_terms": 923436},
"ms-marco-passage": {
"name": "ms-marco-passage",
"description": "MS MARCO Passage Dataset",
"url": {"uwaterloo": "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-passage-20191117-0ed488.tar.gz"},
"md5": "3c2ef64ee6d0ee8e317adcb341b92e28",
"downloaded": False,
"size compressed": "2153209812 bytes",
"size uncompressed": "2675783168 bytes",
"total_terms": 352316036,
"documents": 8841823,
"non_empty_documents": 8841823,
"unique_terms": -1},
"ms-marco-doc": {
"name": "ms-marco-doc",
"description": "MS MARCO Doc Dataset",
"url": {"dropbox": "https://www.dropbox.com/s/awukuo8c0tkl9sc/index-msmarco-doc-20200527-a1ecfa.tar.gz?dl=1"},
"md5": "72b1a0f9a9094a86d15c6f4babf8967a",
"downloaded": False,
"size compressed": "13661943256 bytes",
"size uncompressed": "16769683456 bytes",
"total_terms": 2748636047,
"documents": 3213835,
"non_empty_documents": 3213835,
"unique_terms": -1},
"enwiki-paragraphs": {
"name": "lucene-index.enwiki-20180701-paragraphs",
"description": "English Wikipedia",
"url": {"dropbox": "https://www.dropbox.com/s/b7qqaos9ot3atlp/lucene-index.enwiki-20180701-paragraphs.tar.gz?dl=1"},
"md5": "77d1cd530579905dad2ee3c2bda1b73d",
"downloaded": False,
"size compressed": "17725958785 bytes",
"size uncompressed": "21854924288 bytes",
"total_terms": 1498980668,
"documents": 39880064,
"non_empty_documents": 39879903,
"unique_terms": -1},
"zhwiki-paragraphs": {
"name": "lucene-index.zhwiki-20181201-paragraphs",
"description": "Chinese Wikipedia",
"url": {"dropbox": "https://www.dropbox.com/s/6zn16mombt0wirs/lucene-index.zhwiki-20181201-paragraphs.tar.gz?dl=1"},
"md5": "c005af4036296972831288c894918a92",
"downloaded": False,
"size compressed": "3284531213 bytes",
"size uncompressed": "3893332992 bytes",
"total_terms": 320776789,
"documents": 4170312,
"non_empty_documents": 4170301,
"unique_terms": -1}
}
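The registry is plain data, so entries can be inspected directly; a small sketch (the size formatting is illustrative, not part of the PR):

```python
from pyserini.indexInfo import INDEX_INFO

info = INDEX_INFO['robust04']
size_gb = int(info['size compressed'].split()[0]) / 1e9
print(f"{info['description']}: {size_gb:.1f} GB compressed, "
      f"{info['documents']:,} documents, md5 {info['md5']}")
```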
19 changes: 18 additions & 1 deletion pyserini/search/_searcher.py
@@ -26,7 +26,7 @@
from pyserini.pyclass import autoclass, JString, JArrayList
from pyserini.trectools import TrecRun
from pyserini.fusion import FusionMethod, reciprocal_rank_fusion
from pyserini.util import download_prebuilt_index
from pyserini.util import download_prebuilt_index, get_indexes_info

logger = logging.getLogger(__name__)

@@ -51,9 +51,26 @@ def __init__(self, index_dir: str):

@classmethod
def from_prebuilt_index(cls, prebuilt_index_name: str):
"""Build a searcher from the prebuilt index, download the index if necessary.

Parameters
----------
prebuilt_index_name : str
Prebuilt index name.

Returns
-------
SimpleSearcher
Searcher built from the prebuilt index.
"""
index_dir = download_prebuilt_index(prebuilt_index_name)
return cls(index_dir)

@staticmethod
def list_prebuilt_indexes():
"""Display available prebuilt indexes' information."""
get_indexes_info()

def search(self, q: Union[str, JQuery], k: int = 10, query_generator: JQueryGenerator = None, strip_segment_id=False, remove_dups=False) -> List[JSimpleSearcherResult]:
"""Search the collection.

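A usage sketch pairing the new classmethod with the existing search API (the query text is illustrative; the first call downloads roughly 2 GB for this index):

```python
from pyserini.search import SimpleSearcher

searcher = SimpleSearcher.from_prebuilt_index('ms-marco-passage')
hits = searcher.search('how do lucene indexes work', k=10)
for i, hit in enumerate(hits):
    print(f'{i + 1:2} {hit.docid:15} {hit.score:.5f}')
```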
51 changes: 26 additions & 25 deletions pyserini/util.py
@@ -21,25 +21,8 @@
import tarfile
from tqdm import tqdm
from urllib.request import urlretrieve

INDEX_INFO = {
'index-marco-passage': {
'urls': {'uwaterloo': 'https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-passage-20191117-0ed488.tar.gz'},
'md5': '3c2ef64ee6d0ee8e317adcb341b92e28'},
'index-marco-doc': {
'urls': {'dropbox': 'https://www.dropbox.com/s/awukuo8c0tkl9sc/index-msmarco-doc-20200527-a1ecfa.tar.gz?dl=1'},
'md5': '72b1a0f9a9094a86d15c6f4babf8967a'},
'index-robust04': {
'urls': {'uwaterloo': 'https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz'},
'md5': '15f3d001489c97849a010b0a4734d018'}
}

INDEX_MAPPING = {
'ms-marco-passage': INDEX_INFO['index-marco-passage'],
'ms-marco-doc': INDEX_INFO['index-marco-doc'],
'trec45': INDEX_INFO['index-robust04'],
'robust04': INDEX_INFO['index-robust04']
}
import pandas as pd
from pyserini.indexInfo import INDEX_INFO


# https://gist.github.com/leimao/37ff6e990b3226c2c9670a2cd1e4a6f5
@@ -99,7 +82,7 @@ def download_and_unpack_index(url, index_directory='indexes', force=False, verbo

if prebuilt:
index_directory = os.path.join(get_cache_home(), 'indexes')
index_path = os.path.join(index_directory, f'{index_name}{md5}')
index_path = os.path.join(index_directory, f'{index_name}.{md5}')
local_tarball = os.path.join(index_directory, f'{index_name}.tar.gz')
if not os.path.exists(index_directory):
os.makedirs(index_directory)
@@ -139,15 +122,33 @@ def download_and_unpack_index(url, index_directory='indexes', force=False, verbo
os.rename(os.path.join(index_directory, f'{index_name}'), index_path)
return index_path

def check_downloaded(index_name):
mirror = next(iter(INDEX_INFO[index_name]["url"]))
index_url = INDEX_INFO[index_name]["url"][mirror]
index_md5 = INDEX_INFO[index_name]["md5"]
index_name = index_url.split('/')[-1]
index_name = re.sub(r'\.tar\.gz.*$', '', index_name)
index_directory = os.path.join(get_cache_home(), 'indexes')
index_path = os.path.join(index_directory, f'{index_name}.{index_md5}')
return os.path.exists(index_path)

def get_indexes_info():
indexDf = pd.DataFrame.from_dict(INDEX_INFO)
for index in indexDf.keys():
indexDf[index]['downloaded'] = check_downloaded(index)
with pd.option_context('display.max_rows', None, 'display.max_columns', \
None, 'display.max_colwidth', -1, 'display.colheader_justify', 'left'):
print(indexDf)

def download_prebuilt_index(index_name, force=False, verbose=True, mirror=None):
if index_name in INDEX_MAPPING:
if index_name in INDEX_INFO:
if not mirror:
mirror = next(iter(INDEX_MAPPING[index_name]["urls"]))
elif mirror not in INDEX_MAPPING[index_name]["urls"]:
mirror = next(iter(INDEX_INFO[index_name]["url"]))
elif mirror not in INDEX_INFO[index_name]["url"]:
raise ValueError("unrecognized mirror name {}".format(mirror))
index_url = INDEX_MAPPING[index_name]["urls"][mirror]
index_md5 = INDEX_MAPPING[index_name]["md5"]
index_url = INDEX_INFO[index_name]["url"][mirror]
index_md5 = INDEX_INFO[index_name]["md5"]
return download_and_unpack_index(index_url, prebuilt=True, md5=index_md5)
else:
raise ValueError("unrecognized index name {}".format(index_name))