Source code for pymedextcore.brat_parser

import re
from dataclasses import dataclass
from typing import Dict, List, Sequence, Text, Tuple

from smart_open import open

# Entity types that mark an entity as a group; entities of these types are
# turned into ``Grouping`` objects by ``get_entities_relations_attributes_groups``.
GROUPING_ENTITIES = frozenset(["And-Group", "Or-Group"])
# Relation types used to attach items to a group: the relation's subject is
# the group entity and its object is one member of the group.
GROUPING_RELATIONS = frozenset(["And", "Or"])

def remove_empty(iterable: Sequence[Text]) -> Sequence[Text]:
    """
    Drop blank strings from a sequence of strings.

    Parameters
    ==========
    - iterable : Sequence[Text]
        Strings to filter; some of them may be empty or whitespace-only.

    Returns
    =======
    - A new list holding, in their original order, the strings that still
      contain at least one character once surrounding whitespace is stripped.
      The kept strings themselves are returned unmodified.
    """
    # A stripped string is falsy exactly when its length is zero.
    return [item for item in iterable if item.strip()]
def sanitize_tabs(line: str, max_tabs: int = 2) -> str:
    """
    Keep the first ``max_tabs`` tab characters of a line and blank out the rest.

    Parameters
    ==========
    - line : str
        The text to clean.
    - max_tabs : int
        How many leading tab characters are preserved as real tabs.

    Returns
    =======
    - str
        ``line`` where every tab met once ``max_tabs`` tabs have been kept is
        replaced by a single space; all other characters are unchanged.
    """
    pieces: List[str] = []
    kept_tabs = 0
    for ch in line:
        if ch != "\t":
            pieces.append(ch)
            continue
        if kept_tabs < max_tabs:
            # Still within the allowance: keep the tab and count it.
            kept_tabs += 1
            pieces.append(ch)
        elif kept_tabs == max_tabs:
            pieces.append(" ")
        else:
            # Only reachable when max_tabs is negative; the tab is kept as-is.
            pieces.append(ch)
    return "".join(pieces)
@dataclass
class Entity:
    """A text-bound annotation: identifier, type label, spans and covered text."""

    id: str
    type: str
    # One (start, end) pair per fragment of the annotation.
    span: Tuple[Tuple[int, int], ...]
    text: str

    @property
    def start(self) -> int:
        """First value of the first span fragment."""
        first_fragment = self.span[0]
        return first_fragment[0]

    @property
    def end(self) -> int:
        """Last value of the last span fragment."""
        last_fragment = self.span[-1]
        return last_fragment[-1]
@dataclass
class Relation:
    """A typed, directed link between two annotations, referenced by id."""

    id: str
    type: str
    # Identifier of the annotation the relation starts from.
    subj: str
    # Identifier of the annotation the relation points to.
    obj: str
@dataclass
class Attribute:
    """A named attribute attached to a target annotation, with optional values."""

    id: str
    type: str
    # Identifier of the annotation this attribute is attached to.
    target: str
    # Extra values of the attribute; empty when none are given.
    values: Tuple[str, ...] = ()
@dataclass
class Grouping:
    """A group annotation together with the entities that belong to it."""

    id: str
    type: str
    items: List[Entity]

    @property
    def text(self):
        """Member texts joined by the part of ``type`` before the first ``-``.

        The connective is surrounded by single spaces, so a type of
        ``"And-Group"`` with two items yields ``"<text1> And <text2>"``.
        """
        connective = self.type.split("-")[0]
        separator = f" {connective} "
        return separator.join(member.text for member in self.items)
@dataclass
class AugmentedEntity:
    """An entity bundled with the relations and attributes that involve it."""

    id: str
    type: str
    # One (start, end) pair per fragment of the annotation.
    span: Tuple[Tuple[int, int], ...]
    text: str
    # Relations leaving this entity.
    relations_from_me: Tuple[Relation, ...]
    # Relations arriving at this entity.
    relations_to_me: Tuple[Relation, ...]
    # Attributes attached to this entity.
    attributes: Tuple[Attribute, ...]

    @property
    def start(self) -> int:
        """First value of the first span fragment."""
        first_fragment = self.span[0]
        return first_fragment[0]

    @property
    def end(self) -> int:
        """Last value of the last span fragment."""
        last_fragment = self.span[-1]
        return last_fragment[-1]
@dataclass
class Document:
    """The parsed content of one annotation source, split by annotation kind."""

    entities: List[Entity]
    relations: List[Relation]
    attributes: List[Attribute]
def parse(ann_path: str) -> Document:
    """
    Read an annotation file and wrap its content in a ``Document``.

    Parameters
    ==========
    - ann_path : str
        The path to the annotation file.

    Returns
    =======
    - Document
        A document holding fresh lists of the entities, relations and
        attributes returned by ``read_file_annotations``.
    """
    parsed = read_file_annotations(ann_path)
    # Copy each collection so the document owns its own lists.
    entity_list, relation_list, attribute_list = (list(part) for part in parsed)
    return Document(entity_list, relation_list, attribute_list)
def parse_string(annotation_string: str) -> Document:
    """
    Parse the text of an annotation file into a ``Document``.

    Parameters
    ==========
    - annotation_string : str
        The raw annotation text.

    Returns
    =======
    - Document
        The entities (``T`` ids), relations (``R`` ids) and attributes
        (``A`` ids) found in the text, in order of appearance.
    """
    # A leading newline lets the very first id match the "\n<id>\t" pattern.
    text = "\n" + annotation_string
    # Blank out lines that start with "#".
    text = re.sub(r"^#.+", "", text, flags=re.MULTILINE)
    # The capture group keeps each id in the result, so ids and their content
    # alternate once blank chunks are dropped.
    chunks = remove_empty(re.split(r"\n([TRAE]\d+\t)", text))

    entities = []
    relations = []
    attributes = []
    handlers = {
        "T": (parse_entity, entities),
        "R": (parse_relation, relations),
        "A": (parse_attribute, attributes),
    }
    for position in range(0, len(chunks), 2):
        tag_id = chunks[position]
        handler = handlers.get(tag_id[:1])
        if handler is None:
            # Ids with no registered handler (such as "E") are skipped.
            continue
        parser, bucket = handler
        # The content chunk is only looked up for handled ids.
        bucket.append(parser(tag_id, chunks[position + 1]))
    return Document(entities, relations, attributes)
def parse_string_to_augmented_entities(
    annotation_string: str,
) -> Dict[str, AugmentedEntity]:
    """
    Parse annotation text and attach to each entity its relations and attributes.

    Parameters
    ==========
    - annotation_string : str
        The raw annotation text, as accepted by ``parse_string``.

    Returns
    =======
    - Dict[str, AugmentedEntity]
        One ``AugmentedEntity`` per parsed entity, keyed by entity id. Each
        carries the relations whose subject is the entity, the relations
        whose object is the entity, and the attributes targeting it.
    """
    document = parse_string(annotation_string)
    augmented_entities: Dict[str, AugmentedEntity] = {}
    for entity in document.entities:
        # The id must be the entity's own identifier: it is compared against
        # relation endpoints and attribute targets below.
        entity_id = entity.id
        relations_from_me = tuple(
            relation for relation in document.relations if relation.subj == entity_id
        )
        relations_to_me = tuple(
            relation for relation in document.relations if relation.obj == entity_id
        )
        entity_attributes = tuple(
            attribute
            for attribute in document.attributes
            if attribute.target == entity_id
        )
        augmented_entities[entity_id] = AugmentedEntity(
            id=entity_id,
            type=entity.type,
            span=entity.span,
            text=entity.text,
            relations_from_me=relations_from_me,
            relations_to_me=relations_to_me,
            attributes=entity_attributes,
        )
    return augmented_entities
def get_augmented_entities(ann_path: str) -> Dict[str, AugmentedEntity]:
    """
    Read an annotation file and attach to each entity its relations and attributes.

    Parameters
    ==========
    - ann_path : str
        The path to the annotation file.

    Returns
    =======
    - Dict[str, AugmentedEntity]
        One ``AugmentedEntity`` per entity, keyed by entity id. Each carries
        the relations whose subject is the entity, the relations whose object
        is the entity, and the attributes targeting it.
    """
    # Groups are computed by the helper but not needed here.
    entities, relations, attributes, _ = get_entities_relations_attributes_groups(
        ann_path
    )
    augmented_entities: Dict[str, AugmentedEntity] = {}
    for entity_id, entity in entities.items():
        relations_from_me = tuple(
            relation for relation in relations.values() if relation.subj == entity_id
        )
        relations_to_me = tuple(
            relation for relation in relations.values() if relation.obj == entity_id
        )
        entity_attributes = tuple(
            attribute
            for attribute in attributes.values()
            if attribute.target == entity_id
        )
        augmented_entities[entity_id] = AugmentedEntity(
            id=entity_id,
            type=entity.type,
            span=entity.span,
            text=entity.text,
            relations_from_me=relations_from_me,
            relations_to_me=relations_to_me,
            attributes=entity_attributes,
        )
    return augmented_entities
def list_to_dict(s: List) -> Dict:
    """
    Index a list of annotation objects by their ``id`` attribute.

    Parameters
    ==========
    - s : List
        Objects exposing an ``id`` attribute.

    Returns
    =======
    - Dict
        A mapping from each object's ``id`` to the object itself. When
        several objects share an id, the last one wins.
    """
    # Keying by id yields the declared Dict; a bare `{i for i in s}` would
    # build a set and fail on unhashable items.
    return {item.id: item for item in s}
def get_entities_relations_attributes_groups(
    ann_path: str,
) -> Tuple[
    Dict[str, Entity],
    Dict[str, Relation],
    Dict[str, Attribute],
    Dict[str, Grouping],
]:
    """
    Read an annotation file and index its content by id, including groups.

    Parameters
    ==========
    - ann_path : str
        The path to the annotation file.

    Returns
    =======
    - Tuple of four dictionaries, each keyed by annotation id: the entities,
      the relations, the attributes, and the groups. A group is built for
      every entity whose type is in ``GROUPING_ENTITIES``; its items are the
      objects of the ``GROUPING_RELATIONS`` relations whose subject is that
      entity.

    Raises
    ======
    - KeyError
        If a grouping relation points to an object id that is not an entity.
    """
    entities_s, relations_s, attributes_s = read_file_annotations(ann_path)
    entities: Dict[str, Entity] = list_to_dict(entities_s)
    relations: Dict[str, Relation] = list_to_dict(relations_s)
    attributes: Dict[str, Attribute] = list_to_dict(attributes_s)

    # Process Groups
    # Keyed by relation id so that ``.values()`` below is valid; a set of
    # relations would have no ``.values()`` (and dataclasses with the default
    # ``eq`` are unhashable anyway).
    grouping_relations: Dict[str, Relation] = {
        relation.id: relation
        for relation in relations.values()
        if relation.type in GROUPING_RELATIONS
    }

    groups: Dict[str, Grouping] = {}
    for entity_id, entity in entities.items():
        if entity.type not in GROUPING_ENTITIES:
            continue
        items: List[Entity] = [
            entities[relation.obj]
            for relation in grouping_relations.values()
            if relation.subj == entity_id
        ]
        groups[entity_id] = Grouping(entity_id, entity.type, items)

    return entities, relations, attributes, groups
def parse_entity(tag_id: str, tag_content: str) -> Entity:
    r"""
    Build an Entity from the id and the content of a text-bound annotation.

    Parameters
    ==========
    - tag_id : str
        The tag id as found in the annotation (`T12\t` for example).
    - tag_content : str
        What follows the id: the type, the span offsets, a tab, then the
        covered text (`Temporal-Modifier 116 126\thistory of` for example).
        Several span fragments are separated by `;`.

    Returns
    =======
    - Entity
        The entity with its stripped id, its type, one `(start, end)` pair
        per span fragment, and its text.
    """
    # Only the first tab separates the header from the text; any further tab
    # is turned into a space so the split below yields exactly two parts.
    cleaned = sanitize_tabs(tag_content, max_tabs=1)
    try:
        header, text = cleaned.strip().split("\t")
    except Exception as error:  # pragma: no cover
        print(tag_id)
        raise error

    # The type label is everything before the first space of the header.
    label = header.partition(" ")[0].strip()
    offsets: List[Tuple[int, int]] = []
    for fragment in header[len(label) :].split(";"):
        start_s, end_s = fragment.split()
        offsets.append((int(start_s), int(end_s)))

    return Entity(tag_id.strip(), label, tuple(offsets), text)
def parse_relation(relation_id: str, relation_content: str) -> Relation:
    r"""
    Build a Relation from the id and the content of a relation annotation.

    Parameters
    ==========
    - relation_id : str
        The relation id as found in the annotation (`R12\t` for example).
    - relation_content : str
        What follows the id: the relation type and its two arguments
        (`Modified-By Arg1:T8 Arg2:T6\t` for example).

    Returns
    =======
    - Relation
        The relation with its stripped id, its type, and the subject and
        object ids with their `Arg1:` / `Arg2:` markers removed.
    """
    try:
        # Splitting on any whitespace also discards surrounding whitespace;
        # exactly three fields are expected.
        relation_type, first_arg, second_arg = relation_content.split()
    except Exception as error:
        print(relation_id)
        raise error

    return Relation(
        relation_id.strip(),
        relation_type,
        first_arg.replace("Arg1:", ""),
        second_arg.replace("Arg2:", ""),
    )
def parse_attribute(attribute_id: str, attribute_content: str) -> Attribute:
    r"""
    Build an Attribute from the id and the content of an attribute annotation.

    Parameters
    ==========
    - attribute_id : str
        The attribute id as found in the annotation (`A1\t` for example).
    - attribute_content : str
        What follows the id: the attribute name, its target and optional
        values, separated by single spaces (`Tense T19 Past-Ended` for
        example).

    Returns
    =======
    - Attribute
        The attribute with its stripped id, its name, its target id, and any
        remaining tokens as its values.

    Raises
    ======
    - ValueError
        If the content holds fewer than two space-separated tokens.
    """
    tokens = attribute_content.strip().split(" ")
    if len(tokens) < 2:
        raise ValueError("The input attribute couldn't be parsed.")

    name, target, *extra = tokens
    identifier = attribute_id.strip()
    if not extra:
        # No values given: rely on the Attribute default.
        return Attribute(identifier, name, target)
    return Attribute(identifier, name, target, tuple(extra))
def read_file_annotations(
    ann: str,
) -> Tuple[List[Entity], List[Relation], List[Attribute]]:
    """
    Read an annotation file and get the Entities, Relations and Attributes in it.

    Parameters
    ==========
    - ann : str
        The path to the annotation file to be processed. It is opened with
        the module-level ``open`` using UTF-8 encoding.

    Returns
    =======
    - Tuple[List[Entity], List[Relation], List[Attribute]]
        A tuple of lists of Entities, Relations, and Attributes, as produced
        by ``parse_string``.
    """
    # Read the whole file at once; the context manager closes it afterwards.
    with open(ann, encoding="utf-8") as f:
        ann_content = f.read()
    document = parse_string(ann_content)
    return document.entities, document.relations, document.attributes