diff --git a/README.md b/README.md
index 5c6dcc7..219741c 100644
--- a/README.md
+++ b/README.md
@@ -34,12 +34,10 @@ where:
 ## Usage
 
-```py
-from pubtator_loader import PubTatorCorpusReader
-
-dataset_reader = PubTatorCorpusReader('./sample_pubator_input.txt')
-corpus = dataset_reader.load_corpus()
-# corpus will be a List[PubtatorDocuments]
+```python
+from pubtator_loader import from_path, PubTatorDocument
+
+corpus: list[PubTatorDocument] = from_path('./sample_pubator_input.txt')
 
 for doc in corpus:
     print(doc)
 
diff --git a/pubtator_loader/__init__.py b/pubtator_loader/__init__.py
index 17aeaed..c56a434 100644
--- a/pubtator_loader/__init__.py
+++ b/pubtator_loader/__init__.py
@@ -1,2 +1,2 @@
-from .models import PubTatorEntity, PubTatorDocument # noqa
-from .pubtator_corpus_reader import PubTatorCorpusReader
\ No newline at end of file
+from .models import PubTatorDocument, PubTatorEntity # noqa
+from .pubtator_corpus_reader import PubTatorCorpusReader, from_gz, from_lines, from_path # noqa
diff --git a/pubtator_loader/models/pubtator_document.py b/pubtator_loader/models/pubtator_document.py
index 2cb28d8..449e718 100644
--- a/pubtator_loader/models/pubtator_document.py
+++ b/pubtator_loader/models/pubtator_document.py
@@ -1,12 +1,11 @@
-from spacy.language import Language
-from spacy.training import offsets_to_biluo_tags
 from .pubtator_entities import PubTatorEntity
-from typing import List
+from typing import List, TYPE_CHECKING
 import re
 import json
-from spacy.tokenizer import Tokenizer
-from spacy.util import compile_prefix_regex, compile_suffix_regex
-
+
+if TYPE_CHECKING:
+    import spacy.language
+    import spacy.tokenizer
 
 class PubTatorDocument:
     def __init__(self, id):
@@ -83,7 +82,10 @@ def __replace_overlapping_entities(self, span_replacement_fn):
 
         self.entities = processed_entities
 
-    def __get_custom_tokenizer(self, nlp: Language) -> Tokenizer:
+    def __get_custom_tokenizer(self, nlp: 'spacy.language.Language') -> 'spacy.tokenizer.Tokenizer':
+        from spacy.util import compile_prefix_regex, compile_suffix_regex
+        from spacy.tokenizer import Tokenizer
+
         infix_re = re.compile(
             r'''[!\"\#\$\%\&\'\(\)\*\+\,\-\.\/ \:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]'''
@@ -97,7 +99,7 @@ def __get_custom_tokenizer(self, nlp: Language) -> Tokenizer:
             infix_finditer=infix_re.finditer, token_match=None)
 
-    def tokenize_and_convert_to_bilou(self, nlp: Language):
+    def tokenize_and_convert_to_bilou(self, nlp: 'spacy.language.Language'):
         self.replace_overlapping_entities_w_longest()
         text = self.get_space_separated_title_and_abstract()
         # we need to use a custom tokenizer to avoid the alignment issues
@@ -113,6 +115,7 @@ def tokenize_and_convert_to_bilou(self, nlp: Language):
         results = []
         sentences_started = 0
 
+        from spacy.training import offsets_to_biluo_tags
         for token, semantic_type_id, entity_id in zip(
                 document,
                 offsets_to_biluo_tags(document,
diff --git a/pubtator_loader/pubtator_corpus_reader.py b/pubtator_loader/pubtator_corpus_reader.py
index 36ce686..53d17e7 100644
--- a/pubtator_loader/pubtator_corpus_reader.py
+++ b/pubtator_loader/pubtator_corpus_reader.py
@@ -1,10 +1,28 @@
 from enum import Enum
 import re
+import gzip
 
 from . import PubTatorDocument
 from . import PubTatorEntity
 
+def from_gz(path, mode='rt'):
+    """Parse a pubtator corpus from a gzip file at the given path."""
+    with gzip.open(path, mode=mode) as file:
+        return from_lines(file)
+
+def from_path(path):
+    """Parse a pubtator corpus from a file at the given path."""
+    with open(path) as file:
+        return from_lines(file)
+
+
+def from_lines(lines):
+    """Parse a pubtator corpus from the given iterable of lines."""
+    reader = PubTatorCorpusReader()
+    return reader.parse_lines(lines)
+
+
 class PubTatorCorpusReader:
 
     class LineType(Enum):
         TITLE = 'TITLE'
@@ -12,7 +30,7 @@ class LineType(Enum):
         MENTION = 'MENTION'
         DOC_SEP = 'DOCUMENT SEPARATOR'
 
-    def __init__(self, file_path):
+    def __init__(self, file_path=None):
         self.file_path = file_path
         self.__document_being_read = None
         self.corpus = []
@@ -37,9 +55,9 @@ def __init__(self, file_path):
     def load_corpus(self):
        with open(self.file_path, 'r') as file:
             lines = file.readlines()
-            return self.__parse_lines(lines)
+            return self.parse_lines(lines)
 
-    def __parse_lines(self, content_lines):
+    def parse_lines(self, content_lines):
         prev_line_type = None
         for line_number, line in enumerate(content_lines):
             try: