Source code for pymedextcore.document

import uuid
import json
from .annotators import Annotator, Annotation, Relation

[docs]class Document: """ Document is the main class of pymedext. It is use to load file and annotate them with annotators """ def __init__(self, raw_text, ID=None, attributes=None, source=None, pathToconfig=None, documentDate=None): """create a Document object :param raw_text: raw_text of the doc. if raw_text = load will load a json PyMedExt and transform it back to a Document object :param ID: The document name :param attributes: Dict of attributes related to the document (e.g., person_id). :param source: not use yet but could be the source name I2B2, OMOP HEGP... :param pathToconfig: in case of (raw_text = load), it is a list which contains path to each PyMedExt file (could be use directly to filter) :returns: Document :rtype: Document """ self.documentDate = documentDate self.attributes = attributes self.source = source self.relations = [] if raw_text != "load": self.annotations = [Annotation(type="raw_text", value=raw_text, source_ID=ID, source=source, span=(0, len(raw_text)))] self.ID = str(uuid.uuid1()) self.source_ID = ID else: self.ID = None self.annotations = [] self.source_ID = None for thisPath in pathToconfig: self.load_annotations_files(thisPath)
[docs] def load_annotations_files(self, pathToconfig): """Transform json Pymedext to Document :param pathToconfig: list of path to json files, :returns: add annotations to Document :rtype: Document """ with open(pathToconfig) as f: mesannotations = json.load(f) for annot in mesannotations["annotations"]: # print("annot[value]", annot["value"]) # print("type(annot[value])", type(annot["value"])) if "empty" not in annot["value"]: # print("empty not in annot[value]") if "raw_text" in annot["type"]: if self.ID == None: self.ID = annot["id"] if self.source_ID == None: self.source_ID = annot["source_ID"] if self.source == None: self.source = annot["source"] self.annotations.insert(0, Annotation(type=annot["type"], value=annot["value"], source_ID=annot["source_ID"], ID=annot["id"], source=annot["source"], span=annot["span"])) else: self.annotations.append(Annotation(type=annot["type"], value=annot["value"], source_ID=annot["source_ID"], ID=annot["id"], source=annot["source"], span=annot["span"], attributes=annot["attributes"], isEntity=annot["isEntity"], ngram=annot["ngram"])) for relation in mesannotations['relations']: self.relations.append(Relation(type=relation['type'], head=relation['head'], target=relation['target'], ID=relation['id'], source_ID=relation['source_ID'], source=relation['source']))
[docs] def annotate(self, annotator): """Main function to annotate Document :param annotator: annotators list :returns: run _annotate which add annotations to Document :rtype: Document """ if type(annotator) == Annotator: annotator = [annotator] for ann in annotator: self._annotate(ann)
def _annotate(self, annotator): """ Hidden function to annotate document :param annotator: an annotator :returns: add annotations to a document :rtype: Document """ new_annotations = annotator.annotate_function(self) # print(new_annotations) if new_annotations is not None: for annot in new_annotations: if isinstance(annot, Annotation): self.annotations.append(annot) elif isinstance(annot, Relation): self.relations.append(annot) elif isinstance(annot, tuple) and (any(isinstance(el, Relation) for el in annot) or any( isinstance(el, Annotation) for el in annot)): for obj in annot: if isinstance(obj, Relation): self.relations.append(obj) if isinstance(obj, Annotation): self.annotations.append(obj) else: raise TypeError( "New annotations must be of type Annotation or Relation, or a tuple of Annotation - Relation") # setattr(self, annotator.key_output ,annotator.annotate_function(self))
[docs] def to_json(self): """ transform annotations to a json :returns: transform annotation to json :rtype: json """ return json.dump(self.to_dict())
[docs] def to_dict(self): """transform Document to dict PyMedExt TODO: Need to add the Document Date if available, the processing date, the annotators used :returns: json PyMedExt :rtype: dict """ return {'annotations': [x.to_dict() for x in self.annotations], 'relations': [x.to_dict() for x in self.relations], 'ID': self.ID, 'source_ID': self.source_ID, 'attributes': self.attributes, 'documentDate': self.documentDate }
[docs] @staticmethod def from_dict(d): """Create a Document from a dict of document (as created using to_dict) :param d: Dict :returns: Document :rtype: Document """ doc = Document(raw_text='') for k, v in d.items(): if k != 'annotations' and k != 'relations': setattr(doc, k, v) elif k == 'annotations': doc.annotations = [] for ann in v: doc.annotations.append(Annotation(**ann)) elif k == 'relations': doc.relations = [] for relation in v: doc.relations.append(Relation(**relation)) return doc
[docs] def write_json(self, pathToOutput): """Transform Document to json PyMedExt :param pathToOutput: path to result file :returns: none :rtype: none """ with open(pathToOutput, 'w', encoding='utf-8') as f: json.dump(self.to_dict(), f, ensure_ascii=False, indent=4)
[docs] def get_annotations(self, _type, source_id=None, target_id=None, attributes=None, value=None, span=None): """ returns an annotations of a specific type from source. Can filter from type, source_id or target_id, span, source_id, attributes and value. :param _type: annotation type :param source_id: annotation source id :param target_id: annotation target id :param attributes: :param value: :param span: :return: """ res = [] for anno in self.annotations: if source_id is not None: if anno.source_ID == source_id: res.append(anno) if target_id is not None: if anno.ID == target_id: res.append(anno) if attributes is not None: if anno.attributes == attributes: res.append(anno) if value is not None: if anno.value == value: res.append(anno) if span is not None: if anno.span == span: res.append(anno) if anno.type == _type: res.append(anno) return res
[docs] def get_relations(self, _type=None, head_id=None, target_id=None): """ returns relations of a specific type from source. Can filter from type, head_id or target_id. :param _type: annotation type :param head_id: annotation source id :param target_id: annotation target id :return: """ if self.relations == []: return [] res = [] for relation in self.relations: if _type is not None: if relation.type != _type: continue if head_id is not None: if relation.head != head_id: continue if target_id is not None: if != target_id: continue res.append(relation) return res
[docs] def raw_text(self): """return the Document raw_text :returns: raw_text :rtype: string """ annot = self.get_annotations('raw_text')[0] return annot.value
[docs] def get_graph(self): """return the graph associated with the raw_text :returns: :rtype: """ annot = self.get_annotations('raw_text')[0] return annot
[docs] def get_annotation_by_id(self, _id): res = [x for x in self.annotations if x.ID == _id] if res == []: return None else: return res[0]
[docs] def get_relation_by_id(self, _id): res = [x for x in self.relations if x.ID == _id] if res == []: return None else: return res[0]