# -*- coding: utf-8 -*-
"""
Created 2020/04/14
@author: David BAUDOIN

Purpose: create or update a BRAT file from a pymedext dict.
"""
from typing import List, Optional
from .datatransform import DataTransform
import logging
logger = logging.getLogger(__name__)
import uuid
from .brat_parser import read_file_annotations
from .annotators import Annotation, Relation
from .document import Document
class brat(DataTransform):
    """Convert between pymedext ``Document`` objects and BRAT standoff files."""

    @staticmethod
    def save_to_brat(list_of_documents: Optional[List[Document]] = None,
                     folder_path: Optional[str] = None,
                     pym_ann_types: Optional[List[str]] = None,  # ["QuickUMLS", "regex"]
                     brat_entities_in_pym_types: Optional[List[str]] = None,  # ["QuickUMLS", "regex"]
                     brat_entities_in_pym_types_value: Optional[List[str]] = None,  # ["QuickUMLS", "regex"]
                     brat_entities_in_pym_att_values: Optional[dict] = None,
                     brat_entities_in_pym_att_keys: Optional[dict] = None,
                     brat_attributes: Optional[dict] = None,  # {"QuickUMLS": ["negation", "context", "hypothesis"]}
                     pym_rel_types: Optional[List[str]] = None,  # ["Stanza"]
                     brat_ents_of_rel_in_pym_rel_type: Optional[List[str]] = None,  # ["Stanza"]
                     brat_ents_of_rel_in_pym_ent_value: Optional[List[str]] = None,  # ["Stanza"]
                     brat_ents_of_rel_in_pym_att_values: Optional[dict] = None,  # {"Stanza": "upos"}
                     brat_type_of_rel_in_pym_rel_types: Optional[List[str]] = None,  # ["Stanza"]
                     brat_type_of_rel_in_pym_rel_att_values: Optional[dict] = None,  # {"Stanza": "deprel"}
                     level_annot: Optional[dict] = None  # {"QuickUMLS": 0, "Stanza": 1}
                     ):
        """Write the annotations of pymedext Documents as BRAT files in ``folder_path``.

        For each Document of ``list_of_documents`` two files are created (or overwritten):

        - ID.ann: Brat annotation file (with ID = Document.ID)
        - ID.txt: Raw text of the document (with ID = Document.ID)

        An ``annotation.conf`` file is also created (or overwritten) in the same folder.

        :param list_of_documents: List of Documents input. Documents should contain the same
            types of annotations.
        :param folder_path: path in string format. Files are stored at this location.
            The folder needs to exist already.

        For the other parameters, the following extract of a pymedext document is used in the
        examples, for a better understanding::

            {'type': 'QuickUMLS',
             'value': 'oesophagite',
             'ngram': None,
             'span': (188, 199),
             'source': 'QuickUMLS:v1',
             'source_ID': '6814e9fa-96f7-11eb-a8c8-0242ac110002',
             'isEntity': False,
             'attributes': {'hypothesis': 'certain',
                            'context': 'patient',
                            'negation': 'aff',
                            'cui': 'C0014868',
                            'label': 'oesophagite',
                            'semtypes': ['T047'],
                            'score': 1.0,
                            'snippet': ' La fibroscopie oeso-gastro-duodénale avait révélé une oesophagite peptique de grade II et a permis l’exérèse d’un petit papillome du tiers supérieur de l’œsophage',
                            'snippet_span': (132, 296)},
             'ID': '681c2d82-96f7-11eb-a8c8-0242ac110002'},
            {'type': 'regex',
             'value': 'grade II',
             'ngram': None,
             'span': (212, 220),
             'source': 'RegexMatcher:v1',
             'source_ID': '68155570-96f7-11eb-a8c8-0242ac110002',
             'isEntity': True,
             'attributes': {'version': 'v1',
                            'label': 'Grade',
                            'id_regexp': 'id_grade',
                            'snippet': '-gastro-duodénale avait révélé une oesophagite peptique de grade II et a permis l’exérèse d’un petit papillome du tiers supérie',
                            'hypothesis': 'certain',
                            'context': 'patient',
                            'negation': 'aff'},
             'ID': '682ca3ec-96f7-11eb-a8c8-0242ac110002'},

        Annotations:

        :param pym_ann_types: pymedext annotation types selected for export.
            Example: ['QuickUMLS', 'regex'] -> annotations in Brat will be about these two types.
            Depending on the options filled in (explained below), different labels are displayed.
        :param brat_entities_in_pym_types: (optional) annotation types whose Brat entity is the
            pymedext annotation type itself.
            Example: ['regex'] -> 'grade II' is highlighted with the label 'regex'.
        :param brat_entities_in_pym_types_value: (optional) annotation types whose Brat entity is
            the annotation value.
            Example: ['QuickUMLS'] -> 'oesophagite' is highlighted with the label 'oesophagite'.
        :param brat_entities_in_pym_att_values: (optional) dict mapping a pymedext annotation type
            to an attribute key; the Brat entity is the value of that attribute.
            Example: {'regex': 'label'} -> 'grade II' is highlighted with the label 'Grade'.
        :param brat_entities_in_pym_att_keys: (optional) dict mapping a pymedext annotation type
            to a string that is used directly as the Brat entity.
            Example: {'regex': 'label'} -> 'grade II' is highlighted with the label 'label'.
        :param brat_attributes: (optional) dict with pymedext annotation types as keys and the
            list of attribute keys to export as Brat attributes as values.
            Example: {"QuickUMLS": ['hypothesis', 'negation', 'context']}.
            Use "all" as value to export every attribute of that annotation type,
            e.g. {"QuickUMLS": "all"}.

        Relations:

        :param pym_rel_types: pymedext relation types selected for export. Annotations of these
            types are exported as the entities the relations connect.
            Example: ['Stanza'].
        :param brat_ents_of_rel_in_pym_rel_type: (optional) types whose relation entities are
            labelled with the pymedext annotation type.
        :param brat_ents_of_rel_in_pym_ent_value: (optional) types whose relation entities are
            labelled with the annotation value.
        :param brat_ents_of_rel_in_pym_att_values: (optional) dict mapping a type to the attribute
            key whose value labels the relation entity, e.g. {"Stanza": "upos"}.
        :param brat_type_of_rel_in_pym_rel_types: (optional) relation types exported under the
            pymedext relation type itself.
        :param brat_type_of_rel_in_pym_rel_att_values: (optional) dict mapping a relation type to
            the relation attribute key whose value names the Brat relation,
            e.g. {"Stanza": "deprel"}.
        :param level_annot: (optional) dict mapping an annotation type to the indentation depth
            of its entities in the ``[entities]`` section, e.g. {"QuickUMLS": 0, "Stanza": 1}.
        :return: 1
        """

        def selected(option, annotation_type) -> bool:
            """Return True when ``option`` is set and lists (or maps) ``annotation_type``."""
            # The options are documented as per-type selections; testing only their
            # truthiness let a later option overwrite an earlier label and raised
            # KeyError on dict options for types they do not mention.
            return bool(option) and annotation_type in option

        ### Initialisation
        all_brat_entities: List[str] = []
        # attribute key -> (pymedext annotation type, [distinct values seen]), e.g.
        # {"Family": ("QuickUMLS", ["patient", "family"])}
        all_brat_attributes: dict = {}
        dict_brat_level_entities: dict = {}
        dict_id_to_brat_ent_id: dict = {}  # annotation ID -> 'T<n>', read back by the relation loop
        dict_id_to_type_of_entity: dict = {}  # annotation ID -> brat entity label
        dict_type_of_relation: dict = {}  # brat relation -> list of [Arg1 type, Arg2 type]

        def register_entity(entity) -> None:
            """Add ``entity`` to the conf-file entity list if it is not there yet."""
            # NOTE(review): the body of each `if brat_entity not in all_brat_entities:`
            # was lost in extraction; appending is inferred from the adjacent
            # "fill the conf file progressively" comments.
            if entity not in all_brat_entities:
                all_brat_entities.append(entity)

        ### lists for annotation.conf file:
        # -- entities (value-based labels are added progressively in the loop below)
        if brat_entities_in_pym_types:
            all_brat_entities = all_brat_entities + brat_entities_in_pym_types
        if brat_entities_in_pym_att_keys:
            all_brat_entities = all_brat_entities + list(brat_entities_in_pym_att_keys.values())
        # -- entities of relations
        if brat_ents_of_rel_in_pym_rel_type:
            all_brat_entities = all_brat_entities + brat_ents_of_rel_in_pym_rel_type
        # -- relation types
        if pym_rel_types and brat_type_of_rel_in_pym_rel_types:
            # NOTE(review): relation type names are also listed under [entities],
            # as in the original code; confirm this is intended.
            all_brat_entities = all_brat_entities + brat_type_of_rel_in_pym_rel_types
            for el_type in brat_type_of_rel_in_pym_rel_types:
                dict_type_of_relation[el_type] = []

        ### Loop over the documents:
        for dic_pymedext in list_of_documents or []:
            # ----- Annotation file -----
            doc_id = dic_pymedext.ID
            brat_lines: List[str] = []
            instance_annotation = 0
            instance_attributes = 0
            instance_relation = 0

            ## Loop over the annotation objects of the document
            for annotation in dic_pymedext.annotations:
                brat_entity = None
                ann_type = annotation.type

                # Entity of annotation
                if pym_ann_types and ann_type in pym_ann_types:
                    if selected(brat_entities_in_pym_types, ann_type):
                        brat_entity = ann_type
                    if selected(brat_entities_in_pym_types_value, ann_type):
                        brat_entity = annotation.value
                        register_entity(brat_entity)
                    if selected(brat_entities_in_pym_att_keys, ann_type):
                        brat_entity = brat_entities_in_pym_att_keys[ann_type]
                    if selected(brat_entities_in_pym_att_values, ann_type):
                        brat_entity = annotation.attributes[brat_entities_in_pym_att_values[ann_type]]
                        register_entity(brat_entity)

                # Entity of relation
                if pym_rel_types and ann_type in pym_rel_types:
                    dict_id_to_brat_ent_id[annotation.ID] = 'T' + str(instance_annotation)
                    if selected(brat_ents_of_rel_in_pym_rel_type, ann_type):
                        brat_entity = ann_type  # 'Stanza'
                        dict_id_to_type_of_entity[annotation.ID] = brat_entity
                    if selected(brat_ents_of_rel_in_pym_ent_value, ann_type):
                        brat_entity = annotation.value
                        dict_id_to_type_of_entity[annotation.ID] = brat_entity
                        register_entity(brat_entity)
                    if selected(brat_ents_of_rel_in_pym_att_values, ann_type):
                        # for Stanza, value of 'upos'
                        brat_entity = annotation.attributes[brat_ents_of_rel_in_pym_att_values[ann_type]]
                        dict_id_to_type_of_entity[annotation.ID] = brat_entity
                        register_entity(brat_entity)

                if not brat_entity:
                    # Nothing to export for this annotation.
                    continue

                if level_annot and ann_type in level_annot:
                    dict_brat_level_entities[brat_entity] = level_annot[ann_type]

                # -- Writing annotations --
                brat_lines.append(
                    f"T{instance_annotation}\t{brat_entity} {annotation.span[0]} "
                    f"{annotation.span[1]}\t{annotation.value}"
                )

                # -- Dealing with attributes --
                if brat_attributes and ann_type in brat_attributes:
                    attributes: dict = annotation.attributes
                    wanted = brat_attributes[ann_type]
                    # "all" exports every attribute of the annotation; otherwise only
                    # the listed keys. Both cases share the same per-key handling.
                    attribute_keys = list(attributes) if "all" in wanted else wanted
                    for attribute_key in attribute_keys:
                        attribute_value = attributes.get(attribute_key)
                        # NOTE(review): the update/append statements below were lost in
                        # extraction and are inferred from the surviving fragments.
                        if attribute_key not in all_brat_attributes:
                            # first time this key (like Negation or Family) is seen
                            all_brat_attributes.update(
                                {attribute_key: (ann_type, [attribute_value])})
                        elif attribute_value not in all_brat_attributes.get(attribute_key)[1]:
                            # new value (like "neg" or "aff") for a known key
                            all_brat_attributes.get(attribute_key)[1].append(attribute_value)
                        # -- writing attributes --
                        brat_lines.append(
                            f"A{instance_attributes}\t{attribute_key} "
                            f"T{instance_annotation} {attribute_value}"
                        )
                        instance_attributes += 1

                # -- Increments instance_annotation --
                instance_annotation += 1

            # Dealing with relations
            ## Loop over the relation objects of the document
            if pym_rel_types:
                for relation in dic_pymedext.relations:
                    if relation.type not in pym_rel_types:
                        continue
                    brat_relation = None
                    # Raises KeyError (as before) if an endpoint was not exported above.
                    ent_brat_ID_1 = dict_id_to_brat_ent_id[relation.head]
                    ent_brat_ID_2 = dict_id_to_brat_ent_id[relation.target]
                    arg_types = [dict_id_to_type_of_entity[relation.head],
                                 dict_id_to_type_of_entity[relation.target]]
                    if selected(brat_type_of_rel_in_pym_rel_types, relation.type):
                        brat_relation = relation.type
                    if selected(brat_type_of_rel_in_pym_rel_att_values, relation.type):
                        brat_relation = relation.attributes[
                            brat_type_of_rel_in_pym_rel_att_values[relation.type]]
                    if brat_relation:
                        known_pairs = dict_type_of_relation.setdefault(brat_relation, [])
                        if arg_types not in known_pairs:
                            known_pairs.append(arg_types)
                        # -- Writing relations annotations --
                        # BRAT standoff separates the identifier from the rest with a TAB.
                        brat_lines.append(
                            f"R{instance_relation}\t{brat_relation} "
                            f"Arg1:{ent_brat_ID_1} Arg2:{ent_brat_ID_2}"
                        )
                        # -- Increments instance_relation
                        instance_relation += 1

            # writing file; `with` guarantees the handle is flushed and closed
            brat_annotations = "".join(f"{line}\n" for line in brat_lines)
            with open(f"{folder_path}/{doc_id}.ann", 'w', encoding="utf-8") as f_brat:
                f_brat.write(brat_annotations)

            # ----- raw text -----
            # NOTE(review): the text is read from the *second* annotation of the
            # document, as in the original code; confirm this index is intended.
            raw_text = dic_pymedext.annotations[1].value
            with open(f"{folder_path}/{doc_id}.txt", 'w', encoding="utf-8") as f_brat:
                f_brat.write(raw_text)

        # ----- conf file -----
        entities: str = "[entities]\n\n"
        # dict.fromkeys drops duplicates (a type may appear in several options)
        # while keeping first-seen order.
        for entity in dict.fromkeys(all_brat_entities):
            depth = dict_brat_level_entities.get(entity, 0) if level_annot else 0
            # NOTE(review): the indent unit is a single space as in the source shown;
            # confirm BRAT reads it as hierarchy depth.
            entities += " " * depth + f"{entity}\n"

        relations: str = "\n[relations]\n\n"
        events: str = "\n[events]\n"
        # Attributes section is like:
        #   hypothesis  Arg:Syntagme, Value:certain|hypothesis
        #   context     Arg:Syntagme, Value:family|patient
        #   negation    Arg:Syntagme, Value:neg|aff
        attributes_section: str = "\n[attributes]\n\n"
        if brat_attributes:
            for attribute in all_brat_attributes:
                annotation_type, raw_values = all_brat_attributes.get(attribute)
                list_value = [str(el) for el in raw_values]
                # add default value in order to make the attribute multi valuated.
                # If not, it will mark the attribute as true
                if len(list_value) == 1:
                    # NOTE(review): the placeholder literal was lost in extraction;
                    # "default" is an assumption to confirm against upstream.
                    list_value.append("default")
                attributes_section += (str(attribute) + "\t" + "Arg:" + str(annotation_type)
                                       + ", Value:" + "|".join(list_value) + "\n")

        if pym_rel_types:
            for relation, pairs in dict_type_of_relation.items():
                for el in pairs:
                    # annotation.conf separates the term from its arguments with a TAB.
                    relations += relation + '\t' + 'Arg1:' + el[0] + ', ' + 'Arg2:' + el[1] + "\n"

        conf_file = entities + relations + events + attributes_section
        with open(f"{folder_path}/annotation.conf", 'w', encoding="utf-8") as f_brat:
            f_brat.write(conf_file)
        return 1

    @staticmethod
    def load_from_brat(ann_file: str,
                       txt_file: Optional[str] = None) -> Document:
        """Load annotations from a .ann file in the Brat format

        :param ann_file: path to the .ann file
        :param txt_file: path to the corresponding .txt file, if None: defaults to replacing .ann by .txt
        :returns: Document
        :rtype: Document
        """
        entities, relations, attributes = read_file_annotations(ann_file)

        # Strip only the trailing extension: str.replace would also rewrite ".ann"
        # occurring elsewhere in the path (e.g. a "corpus.annotated/" directory).
        suffix = ".ann"
        stem = ann_file[:-len(suffix)] if ann_file.endswith(suffix) else ann_file
        if txt_file is None:
            txt_file = stem + ".txt"
        with open(txt_file, 'r', encoding="utf-8") as f_txt:
            raw_text = f_txt.read()
        raw_text_ID = str(stem)

        doc = Document(raw_text=raw_text, ID=raw_text_ID, source=ann_file)
        raw_id = doc.get_annotations('raw_text')[0].ID

        # NOTE(review): the statements that built the Annotation / Relation objects were
        # lost in extraction; only `ID = entity.id` and `span=(span[0], span[1])` survived.
        # The remaining keyword arguments, the parsed-object field names
        # (entity.type/.text, relation.subj/.obj/.type) and the `source` value are
        # assumptions to confirm against the Annotation, Relation and brat_parser
        # definitions. The parsed `attributes` are not attached here for the same reason.
        annotations_list = []
        for entity in entities:
            # a discontinuous Brat entity yields one Annotation per fragment
            for span in entity.span:
                ID = entity.id
                annotations_list.append(Annotation(type=entity.type,
                                                   value=entity.text,
                                                   source_ID=raw_id,
                                                   ID=ID,
                                                   source=ann_file,
                                                   span=(span[0], span[1]),
                                                   ))
        relations_list = []
        for relation in relations:
            relations_list.append(Relation(head=relation.subj,
                                           target=relation.obj,
                                           type=relation.type,
                                           source_ID=raw_id,
                                           source=ann_file))
        doc.annotations.extend(annotations_list)
        doc.relations.extend(relations_list)
        return doc
# def update(dic_pymedext, bratFilePath_ann):
# f_brat = open(bratFilePath_ann, 'r')
# lastline = ''
# for line in f_brat:
# lastline = line
# f_brat.close()
# try:
# instance_brat = int(lastline.split(' ')[0][1:])
# f_brat = open(bratFilePath_ann, 'a')
# for element in dic_pymedext['annotations']:
# bratline = 'T' + str(instance_brat) + ' ' + dic_pymedext['annotations']['type'] + ' ' + str(dic_pymedext['annotations']['span'][0]) \
# + ' ' + str(dic_pymedext['annotations']['span'][0]) + ' ' + str(dic_pymedext['annotations']['value'])
# instance_brat += 1
# f_brat.write(bratline)
# f_brat.write('\n')
# f_brat.close()
# except:
# logger.info('cannot turn into int the value : ' + str(lastline.split(' ')[0]))