Source code for pymedextcore.bioctransform


from typing import List, Optional

import bioc
from bioc import biocjson
from .datatransform import DataTransform
from .document import Document
from .annotators import Annotation
import uuid

class BioC(DataTransform):
    """Convert between BioC collections and lists of pymedext ``Document``."""

    @staticmethod
    def load_collection(bioc_input: str, format: int = 0, is_file: bool = True) -> List[Document]:
        """Load a BioC collection (XML or JSON) as a list of Document objects.

        One ``Document`` is built per BioC document of the collection. Each
        passage becomes an ``Annotation`` whose source is ``"BioCPassage"``,
        each passage annotation becomes an entity ``Annotation``, and each
        relation yields one ``Annotation`` per referenced node, placed on the
        span of that node.

        :param bioc_input: a str path to a bioc file or a bioc input string
        :param format: 0 to read XML, any other value to read JSON
        :param is_file: if True bioc_input is a path else it is a string
        :returns: list of Document
        :raises KeyError: if an annotation or relation has no ``type`` infon,
            or if a relation node refers to an annotation id that is not
            part of the same passage
        """
        if format == 0:
            collection = BioC.__load_collection_xml(bioc_input, is_file)
        else:
            collection = BioC.__load_collection_json(bioc_input, is_file)

        documents_collection = []
        for doc in collection.documents:
            # Text, ID and annotations are rebuilt for every BioC document so
            # that nothing leaks from one Document into the next one.
            raw_text_ID = str(uuid.uuid1())
            text_parts = []
            annotations_list = []

            for passage in doc.passages:
                text_parts.append(passage.text)
                passage_ID = str(uuid.uuid1())
                # The passage is typed by its "section_type" infon when there
                # is one; the other passage infons are not carried over.
                annotations_list.append(
                    Annotation(
                        type=passage.infons.get("section_type", "BioCPassage"),
                        value=passage.text,
                        ngram=passage.text,
                        source_ID=raw_text_ID,
                        ID=passage_ID,
                        source="BioCPassage",
                        span=(passage.offset, passage.offset + len(passage.text)),
                    )
                )

                # Spans of the entities of this passage keyed by BioC id, used
                # below to locate the nodes of the relations.
                entity_spans = {}
                for this_annotation in passage.annotations or ():
                    # Only the first location of the annotation is used.
                    location = this_annotation.locations[0]
                    span = (location.offset, location.offset + location.length)
                    this_attributes = BioC._extra_infons(this_annotation.infons)
                    this_attributes["id"] = this_annotation.id
                    entity_spans[this_annotation.id] = span
                    annotations_list.append(
                        Annotation(
                            type=this_annotation.infons["type"],
                            value=BioC._identifier(this_annotation.infons),
                            ngram=this_annotation.text,
                            source_ID=passage_ID,
                            ID=str(uuid.uuid1()),
                            # Class name of the bioc object, e.g. "BioCAnnotation".
                            source=type(this_annotation).__name__,
                            span=span,
                            attributes=this_attributes,
                            isEntity=True,
                        )
                    )

                for this_relation in passage.relations or ():
                    # All the node annotations of one relation share this ID
                    # and the same attributes dict.
                    annotation_ID = str(uuid.uuid1())
                    identifier = BioC._identifier(this_relation.infons)
                    this_attributes = BioC._extra_infons(this_relation.infons)
                    this_attributes["id"] = this_relation.id
                    this_type = type(this_relation).__name__
                    for refNode in this_relation.nodes:
                        annotations_list.append(
                            Annotation(
                                type=this_relation.infons["type"],
                                value=identifier,
                                ngram="Null",
                                source_ID=passage_ID,
                                ID=annotation_ID,
                                source=this_type,
                                span=entity_spans[refNode.refid],
                                attributes=this_attributes,
                            )
                        )

            # Only the source and the date of the collection are passed on to
            # the Document.
            this_document = Document(
                raw_text="".join(text_parts),
                ID=raw_text_ID,
                source=collection.source,
                documentDate=collection.date,
            )
            this_document.annotations.extend(annotations_list)
            documents_collection.append(this_document)
        return documents_collection

    @staticmethod
    def _identifier(infons):
        """Return the ``identifier`` infon (``Identifier`` as fallback) or None."""
        for key in ("identifier", "Identifier"):
            if key in infons:
                return infons[key]
        return None

    @staticmethod
    def _extra_infons(infons):
        """Return a new dict of the infons other than the type and identifier."""
        reserved = ("type", "identifier", "Identifier")
        return {key: infons[key] for key in infons if key not in reserved}

    @staticmethod
    def save_as_collection(list_of_pymedext_documents: List[Document]) -> bioc.BioCCollection:
        """Save a list of pymedext documents as a bioc collection.

        Annotations whose source is ``"BioCPassage"`` become passages, and
        annotations whose source is ``"BioCAnnotation"`` are attached to the
        most recently added passage. Other annotations are ignored, except
        that the first ``raw_text`` annotation gives the collection source.

        :param list_of_pymedext_documents: a list of Document
        :returns: a bioc collection object
        :raises KeyError: if a ``"BioCAnnotation"`` annotation has no ``id``
            attribute
        :raises IndexError: if a ``"BioCAnnotation"`` annotation comes before
            any passage of its document
        """
        this_bioc_collection = bioc.BioCCollection()
        for this_pymedext_doc in list_of_pymedext_documents:
            this_bioc_doc = bioc.BioCDocument()
            for annot in this_pymedext_doc.annotations:
                if annot.type == "raw_text" and this_bioc_collection.source == '':
                    this_bioc_collection.source = annot.source
                if annot.source == "BioCPassage":
                    this_passage = bioc.BioCPassage()
                    this_passage.text = annot.ngram
                    this_passage.offset = annot.span[0]
                    this_bioc_doc.add_passage(this_passage)
                elif annot.source == "BioCAnnotation":
                    this_annotation = bioc.BioCAnnotation()
                    # Work on a copy so that the infons written below never
                    # modify the attributes of the pymedext annotation.
                    infons = dict(annot.attributes)
                    # load_collection moves these two infons to the type and
                    # value of the annotation: put them back so that the
                    # saved collection can be loaded again.
                    infons.setdefault("type", annot.type)
                    if annot.value is not None:
                        infons.setdefault("identifier", annot.value)
                    this_annotation.infons = infons
                    this_annotation.id = annot.attributes["id"]
                    this_annotation.text = annot.ngram
                    start, end = annot.span[0], annot.span[1]
                    this_annotation.add_location(bioc.BioCLocation(start, end - start))
                    this_bioc_doc.passages[-1].add_annotation(this_annotation)
            this_bioc_collection.add_document(this_bioc_doc)
        return this_bioc_collection

    @staticmethod
    def write_bioc_collection(filename: str, collection: bioc.BioCCollection) -> int:
        """Write a BioCCollection as an xml document.

        :param filename: a str filename of the collection
        :param collection: a bioc collection
        :returns: 1
        """
        with bioc.BioCXMLDocumentWriter(filename) as writer:
            writer.write_collection_info(collection)
            for document in collection.documents:
                writer.write_document(document)
        return 1

    @staticmethod
    def __load_collection_xml(bioc_xml: str, is_file: bool = True):
        """Load a xml bioc collection.

        :param bioc_xml: a str path to a bioc file or a bioc input xml string
        :param is_file: if True bioc_xml is a path else it is a string
        :returns: a bioc collection object
        """
        if not is_file:
            return bioc.loads(bioc_xml)
        # Explicit encoding: the result must not depend on the platform locale.
        with open(bioc_xml, 'r', encoding="utf-8") as fp:
            return bioc.load(fp)

    @staticmethod
    def __load_collection_json(bioc_json: str, is_file: bool = True):
        """Load a json bioc collection.

        :param bioc_json: a str path to a bioc file or a bioc input json string
        :param is_file: if True bioc_json is a path else it is a string
        :returns: a bioc collection object
        """
        if not is_file:
            return biocjson.loads(bioc_json)
        # Explicit encoding: the result must not depend on the platform locale.
        with open(bioc_json, 'r', encoding="utf-8") as fp:
            return biocjson.load(fp)