Source code for girder_large_image_annotation.utils

import functools
import json
import math
import os
import re
import threading
import weakref

from bson.objectid import ObjectId

from girder import logger
from girder.constants import AccessType, SortDir
from girder.models.file import File
from girder.models.folder import Folder
from girder.models.item import Item

# Map of file extensions (with leading dot) and MIME types to the name of the
# pandas reader function used to load that kind of file.
dataFileExtReaders = {
    '.csv': 'read_csv',
    'text/csv': 'read_csv',
    '.xls': 'read_excel',
    '.xlsx': 'read_excel',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'read_excel',
    # NOTE: this key previously carried a trailing space and so never matched
    # a real mimeType value.
    'application/vnd.ms-excel': 'read_excel',
    'application/msexcel': 'read_excel',
    'application/x-msexcel': 'read_excel',
    'application/x-ms-excel': 'read_excel',
    'application/x-excel': 'read_excel',
    'application/x-dos_ms_excel': 'read_excel',
    'application/xls': 'read_excel',
    'application/x-xls': 'read_excel',
}
# Number of records kept per sheet when a data file is not read in full.
scanDatafileRecords = 50
scanAnnotationElements = 5000

# Registry of recent PlottableItemData requests (uuid -> weak reference),
# guarded by the lock below.
_recentPlottableItemDataLock = threading.RLock()
_recentPlottableItemData = {}


@functools.lru_cache(maxsize=250)
def _dfFromFile(fileid, full=False):
    """
    Read a Girder file with pandas and return its rows as record dicts.

    Results are memoized per (fileid, full) pair by the lru_cache decorator.

    :param fileid: the id of the file to load; it is loaded with force=True,
        so no permission check is made here.
    :param full: if False, only the first scanDatafileRecords rows of each
        sheet are kept.  If True, all rows are kept.
    :returns: a list of records if there is a single sheet, otherwise a
        dictionary of sheet name to list of records.
    """
    import pandas as pd

    file = File().load(fileid, force=True)
    suffix = os.path.splitext(file['name'])[1]
    # Prefer the extension; fall back to the mime type.
    byMime = dataFileExtReaders.get(file.get('mimeType'), None)
    reader = dataFileExtReaders.get(suffix, byMime)
    if reader != 'read_excel':
        readFunc = getattr(pd, reader)
        sheets = {'entry': readFunc(File().open(file))}
    else:
        kwargs = {
            'sheet_name': None,
            'usecols': lambda x: 'Unnamed: ' not in str(x),
        }
        # Use the calamine engine when it is installed.
        try:
            import python_calamine  # noqa

            kwargs['engine'] = 'calamine'
        except Exception:
            pass
        readFunc = getattr(pd, reader)
        try:
            sheets = readFunc(File().open(file), **kwargs)
        except Exception:
            if 'engine' not in kwargs:
                raise
            # Retry once with the default engine.
            del kwargs['engine']
            sheets = readFunc(File().open(file), **kwargs)
    rowLimit = None if full else scanDatafileRecords
    records = {}
    for sheetName, sheet in sheets.items():
        records[sheetName] = sheet.iloc[:rowLimit].to_dict('records')
    logger.info(f'Read {len(records)} x {len(next(iter(records.values())))} values from '
                f'{file["name"]} {file["size"]}')
    if len(records) == 1:
        return next(iter(records.values()))
    return records


class AnnotationGeoJSON:
    """
    Generate GeoJSON for an annotation via an iterator.
    """

    def __init__(self, annotationId, asFeatures=False, mustConvert=False):
        """
        Return an iterator for converting an annotation into geojson.

        :param annotationId: the id of the annotation.  No permissions checks
            are performed.
        :param asFeatures: if False, return a geojson string.  If True, return
            the features of the geojson.  This can be wrapped in
            `{'type': 'FeatureCollection', 'features': [...output...]}`
            to make it a full geojson object.
        :param mustConvert: if True, raise an exception if any annotation
            elements cannot be converted.  Otherwise, skip those elements.
        """
        from ..models.annotation import Annotation
        from ..models.annotationelement import Annotationelement

        self._id = annotationId
        self.annotation = Annotation().load(id=self._id, force=True, getElements=False)
        self.elemIterator = Annotationelement().yieldElements(self.annotation)
        self.stage = 'header'
        self.first = self.annotation['annotation']
        self.asFeatures = asFeatures
        self.mustConvert = mustConvert

    def __iter__(self):
        """
        Restart iteration from the first element.
        """
        from ..models.annotationelement import Annotationelement

        self.elemIterator = Annotationelement().yieldElements(self.annotation)
        self.stage = 'header'
        return self

    def __next__(self):
        """
        Return the next piece of output.

        The stage moves through header -> firstelement -> elements -> done.
        In string mode the pieces are the collection header, one JSON string
        per feature (comma-prefixed after the first), and the closing
        brackets.  In feature mode each piece is a feature dictionary.

        :returns: a string or a feature dictionary.
        :raises StopIteration: when the output is exhausted.
        :raises Exception: if mustConvert is set and an element has no
            geojson representation.
        """
        if self.stage == 'header':
            self.stage = 'firstelement'
            if not self.asFeatures:
                return '{"type":"FeatureCollection","features":['
        if self.stage == 'done':
            raise StopIteration
        try:
            # Skip elements that cannot be converted (unless mustConvert).
            while True:
                element = next(self.elemIterator)
                result = self.elementToGeoJSON(element)
                if result is not None:
                    break
                if self.mustConvert:
                    msg = f'Element of type {element["type"]} cannot be represented as geojson'
                    raise Exception(msg)
            prefix = ''
            if self.stage == 'firstelement':
                # The annotation record rides along on the first feature.
                result['properties']['annotation'] = self.first
                self.stage = 'elements'
            else:
                prefix = ','
            if not self.asFeatures:
                return prefix + json.dumps(result, separators=(',', ':'))
            return result
        except StopIteration:
            self.stage = 'done'
            if not self.asFeatures:
                return ']}'
            raise

    def rotate(self, r, cx, cy, x, y, z):
        """
        Rotate a point about a center in the xy plane.

        :param r: the rotation in radians.  If falsy, the point is returned
            unchanged.
        :param cx: the x coordinate of the center of rotation.
        :param cy: the y coordinate of the center of rotation.
        :param x: the x coordinate of the point.
        :param y: the y coordinate of the point.
        :param z: the z coordinate of the point; passed through unchanged.
        :returns: a list of [x, y, z] for the rotated point.
        """
        if not r:
            return [x, y, z]
        cosr = math.cos(r)
        sinr = math.sin(r)
        x -= cx
        y -= cy
        # Standard 2D rotation matrix [[cos, -sin], [sin, cos]].  The y term
        # previously used sin twice, which skewed rotated shapes.
        return [x * cosr - y * sinr + cx, x * sinr + y * cosr + cy, z]

    def circleType(self, element, geom, prop):
        """
        Fill geom with the axis-aligned bounding square of a circle element.
        """
        x, y, z = element['center']
        r = element['radius']
        geom['type'] = 'Polygon'
        geom['coordinates'] = [[
            [x - r, y - r, z],
            [x + r, y - r, z],
            [x + r, y + r, z],
            [x - r, y + r, z],
            [x - r, y - r, z],
        ]]

    def ellipseType(self, element, geom, prop):
        """
        Fill geom for an ellipse element; this uses the rectangle conversion.
        """
        return self.rectangleType(element, geom, prop)

    def pointType(self, element, geom, prop):
        """
        Fill geom with a Point at the element's center.
        """
        geom['type'] = 'Point'
        geom['coordinates'] = element['center']

    def polylineType(self, element, geom, prop):
        """
        Fill geom with a Polygon (closed polyline, with any holes as
        additional rings) or a LineString (open polyline).
        """
        # A missing closed flag is treated as an open polyline rather than
        # raising a KeyError.
        if element.get('closed'):
            geom['type'] = 'Polygon'
            geom['coordinates'] = [element['points'][:]]
            # GeoJSON rings repeat the first point at the end.
            geom['coordinates'][0].append(geom['coordinates'][0][0])
            if element.get('holes'):
                for hole in element['holes']:
                    hole = hole[:]
                    hole.append(hole[0])
                    geom['coordinates'].append(hole)
        else:
            geom['type'] = 'LineString'
            geom['coordinates'] = element['points']

    def rectangleType(self, element, geom, prop):
        """
        Fill geom with a Polygon of the four corners of a rectangle element,
        rotated about its center by the element's rotation.
        """
        x, y, z = element['center']
        width = element['width']
        height = element['height']
        rotation = element.get('rotation', 0)
        left = x - width / 2
        right = x + width / 2
        top = y - height / 2
        bottom = y + height / 2

        geom['type'] = 'Polygon'
        geom['coordinates'] = [[
            self.rotate(rotation, x, y, left, top, z),
            self.rotate(rotation, x, y, right, top, z),
            self.rotate(rotation, x, y, right, bottom, z),
            self.rotate(rotation, x, y, left, bottom, z),
            self.rotate(rotation, x, y, left, top, z),
        ]]

    # not represented
    # heatmap, griddata, image, pixelmap, arrow, rectanglegrid
    # heatmap could be MultiPoint, griddata could be rectangle with lots of
    # properties, image and pixelmap could be rectangle with the image id as a
    # property, arrow and rectanglegrid aren't really supported

    def elementToGeoJSON(self, element):
        """
        Convert one annotation element to a geojson Feature dictionary.

        :param element: the annotation element.
        :returns: a Feature dictionary, or None if there is no <type>Type
            method on this class for the element's type.
        """
        elemType = element.get('type', '')
        funcName = elemType + 'Type'
        if not hasattr(self, funcName):
            return None
        result = {
            'type': 'Feature',
            'geometry': {},
            'properties': {
                k: v if k != 'id' else str(v)
                for k, v in element.items() if k in {
                    'id', 'label', 'group', 'user', 'lineColor', 'lineWidth',
                    'fillColor', 'radius', 'width', 'height', 'rotation',
                    'normal',
                }
            },
        }
        getattr(self, funcName)(element, result['geometry'], result['properties'])
        # Record the original type when the geometry name does not convey it.
        if result['geometry']['type'].lower() != element['type']:
            result['properties']['type'] = element['type']
        return result

    @property
    def geojson(self):
        """
        The joined output of iterating this object.
        """
        return ''.join(self)
class GeoJSONAnnotation:
    """
    Convert a GeoJSON object into an annotation dictionary with elements.
    """

    # Feature properties that are copied onto the annotation element.
    _elementProperties = frozenset({
        'id', 'label', 'group', 'user', 'lineColor', 'lineWidth',
        'fillColor', 'radius', 'width', 'height', 'rotation', 'normal',
    })
    # Bare geometry types that are treated as a Feature with no properties.
    _geometryTypes = frozenset({
        'Point', 'LineString', 'Polygon', 'MultiPoint', 'MultiLineString',
        'MultiPolygon',
    })

    def __init__(self, geojson):
        """
        Parse a geojson record.

        :param geojson: a dictionary, list, or tuple of geojson data, or
            anything else that json.loads accepts (such as a string).
        """
        if not isinstance(geojson, (dict, list, tuple)):
            geojson = json.loads(geojson)
        self._elements = []
        self._annotation = {'elements': self._elements}
        self._parseFeature(geojson)

    @staticmethod
    def _xyz(pt):
        """
        Return a coordinate as a list of exactly three values, padding with
        zeros.  Accepts lists or tuples.
        """
        return (list(pt) + [0, 0, 0])[:3]

    def _parseFeature(self, geoelem):  # noqa
        """
        Recursively parse a geojson record, appending any elements found to
        self._elements.  Records that are not recognized are skipped.

        :param geoelem: a list or tuple of records, or a geojson dictionary.
        """
        if isinstance(geoelem, (list, tuple)):
            for entry in geoelem:
                self._parseFeature(entry)
            return
        if not isinstance(geoelem, dict) or 'type' not in geoelem:
            return
        if geoelem['type'] == 'FeatureCollection':
            return self._parseFeature(geoelem.get('features', []))
        if geoelem['type'] == 'GeometryCollection' and isinstance(
                geoelem.get('geometries'), list):
            # The member is named "geometries"; this loop previously read
            # the non-existent "geometry" key and raised a KeyError.
            for entry in geoelem['geometries']:
                self._parseFeature({'type': 'Feature', 'geometry': entry})
            return
        if geoelem['type'] in self._geometryTypes:
            geoelem = {'type': 'Feature', 'geometry': geoelem}
        # GeoJSON allows "properties": null.
        properties = geoelem.get('properties') or {}
        element = {k: v for k, v in properties.items() if k in self._elementProperties}
        if 'label' in element:
            if not isinstance(element['label'], dict):
                element['label'] = {'value': element['label']}
            element['label']['value'] = str(element['label']['value'])
        if properties.get('annotation'):
            # Best effort: a malformed annotation property is ignored.
            try:
                self._annotation.update(properties['annotation'])
            except Exception:
                pass
            self._annotation['elements'] = self._elements
        # GeoJSON allows "geometry": null; there is nothing to convert then.
        geometry = geoelem.get('geometry')
        if not isinstance(geometry, dict):
            return
        elemtype = properties.get('type', '') or geometry.get('type', '')
        func = getattr(self, str(elemtype).lower() + 'Type', None)
        if func is not None:
            result = func(geometry, element)
            if isinstance(result, list):
                self._elements.extend(result)
            else:
                self._elements.append(result)

    def circleType(self, elem, result):
        """
        Convert a polygon geometry to a circle element.  The center is the
        mean of the first four points and the radius is half of their x
        extent.
        """
        cx = sum(e[0] for e in elem['coordinates'][0][:4]) / 4
        cy = sum(e[1] for e in elem['coordinates'][0][:4]) / 4
        try:
            cz = elem['coordinates'][0][0][2]
        except Exception:
            cz = 0
        radius = (max(e[0] for e in elem['coordinates'][0][:4]) -
                  min(e[0] for e in elem['coordinates'][0][:4])) / 2
        result['type'] = 'circle'
        result['radius'] = radius
        result['center'] = [cx, cy, cz]
        return result

    def ellipseType(self, elem, result):
        """
        Convert a polygon geometry to an ellipse element via the rectangle
        conversion.
        """
        result = self.rectangleType(elem, result)
        result['type'] = 'ellipse'
        return result

    def rectangleType(self, elem, result):
        """
        Convert a polygon geometry to a rectangle element.  The center is
        the mean of the first four points; width, height, and rotation are
        taken from the first two edges.
        """
        coor = elem['coordinates'][0]
        cx = sum(e[0] for e in coor[:4]) / 4
        cy = sum(e[1] for e in coor[:4]) / 4
        try:
            cz = elem['coordinates'][0][0][2]
        except Exception:
            cz = 0
        width = ((coor[0][0] - coor[1][0]) ** 2 + (coor[0][1] - coor[1][1]) ** 2) ** 0.5
        height = ((coor[1][0] - coor[2][0]) ** 2 + (coor[1][1] - coor[2][1]) ** 2) ** 0.5
        rotation = math.atan2(coor[1][1] - coor[0][1], coor[1][0] - coor[0][0])
        result['center'] = [cx, cy, cz]
        result['width'] = width
        result['height'] = height
        result['rotation'] = rotation
        result['type'] = 'rectangle'
        return result

    def pointType(self, elem, result):
        """
        Convert a Point geometry to a point element.
        """
        result['center'] = self._xyz(elem['coordinates'])
        result['type'] = 'point'
        return result

    def multipointType(self, elem, result):
        """
        Convert a MultiPoint geometry to a list of point elements.
        """
        results = []
        result['type'] = 'point'
        for entry in elem['coordinates']:
            subresult = result.copy()
            subresult['center'] = self._xyz(entry)
            results.append(subresult)
        return results

    def polylineType(self, elem, result):
        """
        Convert a geometry whose properties mark it as a polyline, based on
        whether the geometry is a LineString or not.
        """
        if elem.get('type') == 'LineString':
            return self.linestringType(elem, result)
        return self.polygonType(elem, result)

    def polygonType(self, elem, result):
        """
        Convert a Polygon geometry to a closed polyline element; rings
        after the first become holes.  The repeated closing point of each
        ring is dropped.
        """
        result['points'] = [self._xyz(pt) for pt in elem['coordinates'][0][:-1]]
        if len(elem['coordinates']) > 1:
            result['holes'] = [
                [self._xyz(pt) for pt in loop[:-1]]
                for loop in elem['coordinates'][1:]
            ]
        result['closed'] = True
        result['type'] = 'polyline'
        return result

    def multipolygonType(self, elem, result):
        """
        Convert a MultiPolygon geometry to a list of closed polyline
        elements.
        """
        results = []
        result['closed'] = True
        result['type'] = 'polyline'
        for entry in elem['coordinates']:
            subresult = result.copy()
            subresult['points'] = [self._xyz(pt) for pt in entry[0][:-1]]
            if len(entry) > 1:
                subresult['holes'] = [
                    [self._xyz(pt) for pt in loop[:-1]]
                    for loop in entry[1:]
                ]
            results.append(subresult)
        return results

    def linestringType(self, elem, result):
        """
        Convert a LineString geometry to an open polyline element.
        """
        result['points'] = [self._xyz(pt) for pt in elem['coordinates']]
        result['closed'] = False
        result['type'] = 'polyline'
        return result

    def multilinestringType(self, elem, result):
        """
        Convert a MultiLineString geometry to a list of open polyline
        elements.
        """
        results = []
        result['closed'] = False
        result['type'] = 'polyline'
        for entry in elem['coordinates']:
            subresult = result.copy()
            subresult['points'] = [self._xyz(pt) for pt in entry]
            results.append(subresult)
        return results

    def annotationToJSON(self):
        """
        Return the annotation as a JSON string.
        """
        return json.dumps(self._annotation)

    @property
    def annotation(self):
        return self._annotation

    @property
    def elements(self):
        return self._elements

    @property
    def elementCount(self):
        return len(self._elements)
def isGeoJSON(annotation):
    """
    Report whether a list or dictionary looks like a GeoJSON record.

    For a list, only the first entry is examined.

    :param annotation: a list or dictionary.
    :returns: True if this appears to be GeoJSON
    """
    geojsonTypes = {
        'Feature', 'FeatureCollection', 'GeometryCollection', 'Point',
        'LineString', 'Polygon', 'MultiPoint', 'MultiLineString',
        'MultiPolygon'}
    candidate = annotation
    if isinstance(candidate, list):
        if len(candidate) < 1:
            return False
        candidate = candidate[0]
    if isinstance(candidate, dict) and 'type' in candidate:
        return candidate['type'] in geojsonTypes
    return False
def _cancelPlottableItemData(uuid, newRecord):
    """
    Register a new record under a uuid, flagging any earlier record that
    used the same uuid as cancelled.

    Only weak references are stored.  When more than seven other entries
    are present, the first-inserted one is dropped.

    :param uuid: the request uuid, or None to do nothing.
    :param newRecord: the object to register; earlier records get their
        cancel attribute set to True.
    """
    if uuid is None:
        return
    with _recentPlottableItemDataLock:
        previous = _recentPlottableItemData.pop(uuid, None)
        if previous is not None:
            # The referent may already be gone; ignore that.
            try:
                previous().cancel = True
            except Exception:
                pass
        if len(_recentPlottableItemData) > 7:
            oldestKey = next(iter(_recentPlottableItemData))
            del _recentPlottableItemData[oldestKey]
        _recentPlottableItemData[uuid] = weakref.ref(newRecord)
class PlottableItemData:
    maxItems = 1000
    maxAnnotationElements = 5000
    maxDistinct = 20
    allowedTypes = (str, bool, int, float)

    def __init__(self, user, item, annotations=None, adjacentItems=False,
                 sources=None, compute=None, uuid=None):
        """
        Get plottable data associated with an item.

        :param user: authenticating user.
        :param item: the item record.
        :param annotations: None, a list of annotation ids, or __all__.  If
            adjacent items are included, the most recent annotation with the
            same name will also be included.
        :param adjacentItems: if True, include data from other items in the
            same folder.  If __all__, include data from other items even if
            the data is not present in the current item.
        :param sources: None for all, or a string with a comma-separated list
            or a list of strings; when a list, the options are folder, item,
            annotation, datafile.
        :param compute: None for none, or a dictionary with keys "columns": a
            list of columns to include in the computation; if unspecified or
            an empty list, no computation is done, "function": a string with
            the name of the function, such as umap, "params": additional
            parameters to pass to the function.  If none of the requiredKeys
            are compute.(x|y|z), the computation will not be performed.  Only
            rows which have all selected columns present will be included in
            the computation.
        :param uuid: An optional uuid to allow cancelling a previous request.
            If specified and there are any outstanding requests with the same
            uuid, they may be cancelled to save resources.
        """
        _cancelPlottableItemData(uuid, self)
        self.user = user
        self._columns = None
        self._datacolumns = None
        self._data = None
        self._compute = None
        self.cancel = False
        # Best effort: a missing or malformed compute specification simply
        # disables the computation.
        try:
            if len(compute['columns']):
                self._compute = {'function': 'umap', 'params': {
                    'random_state': 1, 'n_jobs': 1}}
                self._compute.update(compute)
        except Exception:
            pass
        if sources and not isinstance(sources, (list, tuple)):
            sources = sources.split(',')
        self._sources = tuple(sources) if sources else None
        # Annotations are only searched if an annotation source is allowed.
        if (self._sources and 'annotation' not in self._sources and
                'annotationelement' not in self._sources):
            annotations = None
        self._fullScan = adjacentItems == '__all__'
        self._findItems(item, adjacentItems)
        self._findAnnotations(annotations)
        self._findDataFiles()
        self._dataLock = threading.RLock()

    def _findItems(self, item, adjacentItems=False):
        """
        Find all the large images in the folder.  This only retrieves the first
        self.maxItems entries.  If there are at least this many items, a query
        is stored in self._moreItems.  The items are listed in self.items.

        :param item: the item to use as the base.  If adjacentItems is false,
            this is the entire self.items data set.
        :param adjacentItems: if truthy, find adjacent items.
        """
        self._columns = None
        self.item = item
        self.folder = Folder().load(id=item['folderId'], user=self.user, level=AccessType.READ)
        self.items = [item]
        if adjacentItems:
            query = {
                'filters': {
                    '_id': {'$ne': item['_id']},
                },
                'sort': [('_id', SortDir.ASCENDING)],
            }
            if 'largeImage' in item:
                query['filters']['largeImage.fileId'] = {'$exists': True}
            self.items.extend(list(Folder().childItems(
                self.folder, limit=self.maxItems - 1, **query)))
            self._moreItems = query if len(self.items) == self.maxItems else None
        # TODO: find csv/xlsx/dataframe items in the folder

    def _findAnnotations(self, annotations):
        """
        Find annotations based on a list of annotations ids.  For the current
        item, these are just the listed annotations.  For adjacent items,
        annotations with the same names are located.

        A maximum of maxItems are examined, so if the number of items in the
        folder exceeds this, some annotations will not be located.

        Results are stored in self.annotations, which is a list with one entry
        per item.  Each entry is a list of annotations (without elements) or
        None if there is no matching annotation for that item.  If the current
        item has no matching annotations, self.annotations is None.

        :param annotations: a list of annotation id strings or comma-separated
            string of annotation ids.
        """
        from ..models.annotation import Annotation

        self._columns = None
        if isinstance(annotations, str):
            annotations = annotations.split(',')
        self.annotations = None
        if annotations and len(annotations):
            self.annotations = []
            query = {'_active': {'$ne': False}, 'itemId': self.item['_id']}
            if annotations[0] != '__all__':
                query['_id'] = {'$in': [ObjectId(annotId) for annotId in annotations]}
            self.annotations.append(list(Annotation().find(
                query, limit=0, sort=[('_version', -1)])))
            if not len(self.annotations[0]):
                self.annotations = None
        # Find adjacent annotations.  This is guarded on self.annotations
        # rather than the request, since the list is reset to None above when
        # the current item has no matches; indexing it would raise.
        if self.annotations and len(self.items) > 1:
            names = {}
            for idx, annot in enumerate(self.annotations[0]):
                if annot['annotation']['name'] not in names:
                    names[annot['annotation']['name']] = idx
            for adjitem in self.items[1:]:
                if self.cancel:
                    return
                query = {'_active': {'$ne': False}, 'itemId': adjitem['_id']}
                annotList = [None] * len(self.annotations[0])
                for annot in Annotation().find(query, limit=0, sort=[('_version', -1)]):
                    if self.cancel:
                        return
                    if annot['annotation']['name'] in names and annotList[
                            names[annot['annotation']['name']]] is None:
                        annotList[names[annot['annotation']['name']]] = annot
                self.annotations.append(annotList)

    def _findDataFiles(self):  # noqa
        """
        Find data files inside the current item.  For adjacent items, the data
        file must have the same name or, if the found file is prefixed with the
        item name excluding the extension, then the adjacent file should be
        similarly prefixed.  Data files must have a known suffix or a known
        mimetype that can be read by pandas (and pandas must be installed).

        Results are stored in self._itemfilelist, one list per item.
        """
        # One independent list per item; [[]] * n would alias a single list.
        self._itemfilelist = [[] for _ in self.items]
        try:
            import pandas as pd  # noqa
        except Exception:
            return
        # The documented source name is "datafile"; "filedata" is still
        # accepted since that is what was checked here before.
        if self._sources and not {'datafile', 'filedata'}.intersection(self._sources):
            return
        names0 = {}
        for iidx, item in enumerate(self.items):
            if iidx:
                self._itemfilelist[iidx] = [None] * len(self._itemfilelist[0])
            names = {}
            # NOTE(review): this is the text before the first "." of the
            # item name, which differs from "excluding the extension" for
            # names with several dots -- confirm which is intended.
            prefix = item['name'].rsplit('.')[0]
            for file in Item().childFiles(item):
                if self.cancel:
                    return
                # Skip the large image's own files.  If the base item has no
                # largeImage record, the lookup fails and the file is skipped.
                try:
                    if (file['_id'] == self.item['largeImage']['fileId'] or
                            file['_id'] == self.item['largeImage'].get('originalId')):
                        continue
                except Exception:
                    continue
                ext = os.path.splitext(file['name'])[1]
                if (ext not in dataFileExtReaders and
                        file.get('mimeType') not in dataFileExtReaders):
                    continue
                if file['name'].startswith(prefix):
                    base, name = True, file['name'][len(prefix):]
                else:
                    base, name = False, file['name']
                if (base, name) in names:
                    continue
                # Adjacent items only contribute files that match a file of
                # the first item.
                if iidx and (base, name) not in names0:
                    continue
                names[(base, name)] = len(names)
                if not iidx:
                    self._itemfilelist[0].append(file)
                else:
                    self._itemfilelist[iidx][names0[(base, name)]] = file
            if not iidx:
                names0 = names

    # Common column keys and titles
    commonColumns = {
        'item.id': 'Item ID',
        'item.name': 'Item Name',
        'item.description': 'Item Description',
        'annotation.id': 'Annotation ID',
        'annotation.name': 'Annotation Name',
        'annotation.description': 'Annotation Description',
        'annotationelement.id': 'Annotation Element ID',
        'annotationelement.group': 'Annotation Element Group',
        'annotationelement.label': 'Annotation Element Label',
        'annotationelement.type': 'Annotation Element Type',
        'bbox.x0': 'Bounding Box Low X',
        'bbox.y0': 'Bounding Box Low Y',
        'bbox.x1': 'Bounding Box High X',
        'bbox.y1': 'Bounding Box High Y',
        'compute.x': 'Dimension Reduction X',
        'compute.y': 'Dimension Reduction Y',
        'compute.z': 'Dimension Reduction Z',
    }
    computeColumns = {'compute.x', 'compute.y', 'compute.z'}
def itemNameIDSelector(self, isName, selector):
    """
    Wrap a selector whose value is an item id, an item name, or an item
    name without its extension so that it returns the canonical name or id
    string of the matching entry in self.items.  Values that match no
    known item are returned unchanged.

    :param isName: True to return the canonical name, False for the
        canonical id.
    :param selector: the selector to get the initial value.
    :returns: a function that can be used as an overall selector.
    """
    def locate(record, data, row):
        # Returns the raw value and the first matching item (or None).
        value = selector(record, data, row)
        for candidate in self.items:
            if (str(candidate['_id']) == value or
                    candidate['name'].lower().startswith(value.lower() + '.') or
                    candidate['name'].lower() == value.lower()):
                return value, candidate
        return value, None

    def itemNameSelector(record, data, row):
        value, found = locate(record, data, row)
        return value if found is None else found['name']

    def itemIDSelector(record, data, row):
        value, found = locate(record, data, row)
        return value if found is None else str(found['_id'])

    if isName:
        return itemNameSelector
    return itemIDSelector
def _bboxLookupTable(self): self._bboxLookup = {} for srow, x0val in self._datacolumns['bbox.x0'].items(): x0val = int(x0val) y0val = self._datacolumns['bbox.y0'].get(srow) if y0val is None: continue if x0val not in self._bboxLookup: self._bboxLookup[x0val] = {} if y0val not in self._bboxLookup[x0val]: self._bboxLookup[x0val][y0val] = set() self._bboxLookup[x0val][y0val].add(srow)
def datafileAnnotationElementSelector(self, key, cols):
    """
    Return a selector that finds the value of a data column for the row
    whose bounding box matches the bounding box read from a record.

    :param key: the data column key whose value is returned.
    :param cols: four entries, one each for bbox.x0, bbox.y0, bbox.x1,
        bbox.y1; the second item of each entry is a selector function
        taking (record, data, row).
    :returns: a function taking (record, data, row) that returns the
        matching value or None.
    """
    # Max pixel difference for bounding box
    epsilon = 2
    bboxKeys = ('bbox.x0', 'bbox.y0', 'bbox.x1', 'bbox.y1')

    def probes(center):
        # Integer lookup keys that could be within epsilon of center.
        low = int(math.floor(center - epsilon))
        high = int(math.ceil(center + epsilon))
        return range(low, high + 1)

    def annotationElementSelector(record, data, row):
        bbox = [col[1](record, data, row) for col in cols]
        datacolumns = self._datacolumns
        if not ('bbox.x0' in datacolumns and 'bbox.y0' in datacolumns):
            return None
        # The lookup table is built on first use.
        if not hasattr(self, '_bboxLookup'):
            self._bboxLookupTable()
        if key not in datacolumns:
            return None
        values = datacolumns[key]

        def sameBox(srow):
            # All four edges must be present and within epsilon.
            for bidx, bkey in enumerate(bboxKeys):
                val = datacolumns[bkey].get(srow)
                if val is None or abs(val - bbox[bidx]) > epsilon:
                    return False
            return True

        for x0val in probes(bbox[0]):
            if x0val not in self._bboxLookup:
                continue
            for y0val in probes(bbox[1]):
                if y0val not in self._bboxLookup[x0val]:
                    continue
                for srow in self._bboxLookup[x0val][y0val]:
                    if values[srow] is not None and sameBox(srow):
                        return values[srow]
        return None

    return annotationElementSelector
@staticmethod
def keySelector(mode, key, key2=None):
    """
    Build accessor functions for a value located by one or two keys and,
    for some modes, a row index.

    :param mode: one of key, key0, keykey, keykey0, key0key, representing
        key lookups in dictionaries or array indices.  Any unrecognized
        mode is treated as key.
    :param key: the first key.
    :param key2: the second key, if needed.
    :returns: a pair (selector, length).  The selector takes (record, data,
        row) and returns a value; the record is the base record used, the
        data is the base dictionary, and the row is the location in the
        index.  The length is None for modes without a row index;
        otherwise it is a function that takes (record, data) and returns
        the number of rows that are present.
    """
    def outerLength(record, data):
        return len(data[key])

    def innerLength(record, data):
        return len(data[key][key2])

    if mode == 'key0key':
        return (lambda record, data, row: data[key][row][key2]), outerLength
    if mode == 'keykey0':
        return (lambda record, data, row: data[key][key2][row]), innerLength
    if mode == 'keykey':
        return (lambda record, data, row: data[key][key2]), None
    if mode == 'key0':
        return (lambda record, data, row: data[key][row]), outerLength
    return (lambda record, data, row: data[key]), None
@staticmethod
def recordSelector(doctype):
    """
    Return the function that extracts the main data dictionary from a
    record of the given document type.

    :param doctype: one of folder, item, annotation, annotationelement,
        datafile.  Anything other than annotation, annotationelement, or
        datafile reads the record's meta dictionary.
    :returns: a function that takes (record) and returns the data
        dictionary, if any.
    """
    def annotationGetData(record):
        return record.get('annotation', {}).get('attributes', {})

    def annotationelementGetData(record):
        return record.get('user', {})

    def datafileGetData(record):
        return record

    def getData(record):
        return record.get('meta', {})

    specialized = (
        ('annotation', annotationGetData),
        ('annotationelement', annotationelementGetData),
        ('datafile', datafileGetData),
    )
    for name, getter in specialized:
        if doctype == name:
            return getter
    return getData
def _keysToColumns(self, columns, parts, doctype, getData, selector, length):
    """
    Given a selector and appropriate access information, ensure that an
    appropriate column or columns exist.

    Keys whose last meaningful part looks like an item name/id or a
    bounding box edge are mapped onto the matching common column instead
    of a ``data.*`` column.

    :param columns: the column dictionary to possibly modify.
    :param parts: a tuple of values used to construct a key.
    :param doctype: the base document type.
    :param getData: a function that, given the document record, returns
        the data dictionary.
    :param selector: a function that, given the document record, data
        dictionary, and row, returns a value.
    :param length: None or a function that, given the document record and
        data dictionary, returns the number of rows.
    """
    key = '.'.join(str(v) for v in parts).lower()
    # A trailing '0' is an index marker, not a name; look one part back.
    lastpart = parts[-1] if parts[-1] != '0' or len(parts) == 1 else parts[-2]
    title = ' '.join(str(v) for v in parts[1:] if v != '0')
    keymap = {
        r'(?i)(item|image)_(id|name)$': 'item.name',
        r'(?i)((low|min)(_|)x|^x1$)': 'bbox.x0',
        r'(?i)((low|min)(_|)y|^y1$)': 'bbox.y0',
        r'(?i)((high|max)(_|)x|^x2$)': 'bbox.x1',
        r'(?i)((high|max)(_|)y|^y2$)': 'bbox.y1',
    }
    match = False
    for k, v in keymap.items():
        if re.match(k, lastpart):
            if lastpart != parts[1]:
                # Nested keys get a doctype qualified by the outer key.
                doctype = f'{doctype}.{parts[1]}'
            key = v
            title = self.commonColumns[key]
            if key == 'item.name':
                # One source value feeds both the item name and item id
                # columns.
                self._ensureColumn(
                    columns, key, title, doctype, getData,
                    self.itemNameIDSelector(True, selector), length)
                self._ensureColumn(
                    columns, 'item.id', self.commonColumns['item.id'], doctype,
                    getData, self.itemNameIDSelector(False, selector), length)
                return
            match = True
            break
    added = self._ensureColumn(
        columns, key, title, doctype, getData, selector, length)
    if match and added and key.startswith('bbox'):
        cols = [columns[bkey]['where'][doctype] for bkey in [
            'bbox.x0', 'bbox.y0', 'bbox.x1', 'bbox.y1']
            if bkey in columns and doctype in columns[bkey]['where']]
        if len(cols) == 4:
            # If we load all of these from annotation elements, use all
            # available keys:
            for akey in [col for col in self.commonColumns if col.startswith('annotation')]:
                if self._datacolumns and akey in self._datacolumns:
                    self._requiredColumns.add(akey)
                self._ensureColumn(
                    columns, akey, self.commonColumns[akey],
                    '.'.join(doctype.split('.')[:2]), getData,
                    self.datafileAnnotationElementSelector(akey, cols), length)


def _ensureColumn(self, columns, keyname, title, doctype, getData, selector, length):
    """
    Ensure that column exists and the selectors are recorded for the
    doctype.

    :param columns: the column dictionary to possibly modify.
    :param keyname: the key to the column.
    :param title: the title of the column.
    :param doctype: the base document type.
    :param getData: a function that, given the document record, returns
        the data dictionary.
    :param selector: a function that, given the document record, data
        dictionary, and row, returns a value.
    :param length: None or a function that, given the document record and
        data dictionary, returns the number of rows.
    :returns: True if a ``where`` record was added for this doctype; False
        if the doctype was already registered for the column.
    """
    if keyname not in columns:
        # New columns start as 'number'; _collectRecordRows downgrades
        # them to 'string' when a value cannot be converted to float.
        columns[keyname] = {
            'key': keyname,
            'title': title,
            'where': {},
            'type': 'number',
            'max': None,
            'min': None,
            'distinct': set(),
            'count': 0,
        }
    if doctype not in columns[keyname]['where']:
        columns[keyname]['where'][doctype] = (getData, selector, length)
        return True
    return False


def _columnsFromData(self, columns, doctype, getData, record):  # noqa
    """
    Given a sample record, determine what columns could be read.

    Values are inspected to at most two levels: lists of scalars, lists
    of dictionaries, dictionaries of scalars, and dictionaries of lists.
    Entries that raise while being inspected are skipped.

    :param columns: the column dictionary to possibly modify.
    :param doctype: the base document type.
    :param getData: a function that, given the document record, returns
        the data dictionary.
    :param record: a sample record.
    """
    data = getData(record)
    for key, value in data.items():
        try:
            if isinstance(value, list):
                if not len(value):
                    continue
                if isinstance(value[0], dict):
                    # Only the first list entry is used to discover keys.
                    for key2, value2 in value[0].items():
                        try:
                            if isinstance(value2, (list, dict)):
                                continue
                            selector, length = self.keySelector('key0key', key, key2)
                            self._keysToColumns(
                                columns, ('data', key, '0', key2),
                                doctype, getData, selector, length)
                        except Exception:
                            continue
                else:
                    selector, length = self.keySelector('key0', key)
                    self._keysToColumns(
                        columns, ('data', key, '0'),
                        doctype, getData, selector, length)
            elif isinstance(value, dict):
                for key2, value2 in value.items():
                    try:
                        if isinstance(value2, list):
                            if not len(value2):
                                continue
                            selector, length = self.keySelector('keykey0', key, key2)
                            self._keysToColumns(
                                columns, ('data', key, key2, '0'),
                                doctype, getData, selector, length)
                        else:
                            selector, length = self.keySelector('keykey', key, key2)
                            self._keysToColumns(
                                columns, ('data', key, key2),
                                doctype, getData, selector, length)
                    except Exception:
                        continue
            else:
                selector, length = self.keySelector('key', key)
                self._keysToColumns(
                    columns, ('data', key),
                    doctype, getData, selector, length)
        except Exception:
            continue


def _commonColumn(self, columns, keyname, doctype, getData, selector):
    """
    Ensure that column with a commonly used key exists.

    The title is looked up in ``self.commonColumns`` and the column is
    registered without a length function.

    :param columns: the column dictionary to possibly modify.
    :param keyname: the key to the column.
    :param doctype: the base document type.
    :param getData: a function that, given the document record, returns
        the data dictionary.
    :param selector: a function that, given the document record, data
        dictionary, and row, returns a value.
    """
    title = self.commonColumns[keyname]
    self._ensureColumn(columns, keyname, title, doctype, getData, selector, None)


def _collectRecordRows(  # noqa
        self, record, data, selector, length, colkey, col, recidx, rows,
        iid, aid, eid, doctype, columns):
    """
    Collect statistics and possible data from one data set.

    See _collectRecords for the remaining parameter details.

    :param record: the record being read.
    :param data: the data dictionary obtained from the record.
    :param selector: a function taking (record, data, row) that returns
        the value.
    :param length: None or the length function for this column; only used
        here to decide whether the stored row index is the row or -1.
    :param colkey: the key of the column being collected.
    :param col: the column entry from the columns dictionary.
    :param recidx: the index of the record in its record list.
    :param rows: the number of rows to read from this record.
    :param iid: the item id used in the row key.
    :param aid: the annotation id used in the row key.
    :param eid: the annotation element id used in the row key.
    :param doctype: the document type being read.
    :param columns: the column dictionary.
    :returns: the number of values stored for this column that count
        toward the required data; 0 if cancelled.
    """
    # Folder and data file sources can carry their own annotation and
    # element ids; look them up per row when none was passed in.
    getAid = (aid == '' and (doctype.startswith(('folder', 'datafile.'))))
    getEid = (eid == '' and (doctype.startswith(('folder', 'datafile.'))))
    count = 0
    for rowidx in range(rows):
        if self.cancel:
            return 0
        try:
            value = selector(record, data, rowidx)
        except Exception:
            continue
        if value is None or not isinstance(value, self.allowedTypes) or value == '':
            continue
        if col['type'] == 'number':
            try:
                value = float(value)
            except Exception:
                # The first non-numeric value turns the whole column into
                # a string column.
                col['type'] = 'string'
                col['distinct'] = {str(v) for v in col['distinct']}
        col['count'] += 1
        if col['type'] == 'number':
            if col['min'] is None:
                col['min'] = col['max'] = value
            col['min'] = min(col['min'], value)
            col['max'] = max(col['max'], value)
        else:
            value = str(value)
        if len(col['distinct']) <= self.maxDistinct:
            col['distinct'].add(value)
        if self._datacolumns and colkey in self._datacolumns:
            if getAid:
                try:
                    aid = columns['annotation.id']['where'][doctype][1](record, data, rowidx)
                    if aid is None:
                        aid = ''
                except Exception:
                    pass
            if getEid:
                try:
                    eid = columns['annotationelement.id']['where'][doctype][1](
                        record, data, rowidx)
                    if eid is None:
                        eid = ''
                except Exception:
                    pass
            # A row index of -1 marks a value that is not per-row.
            self._datacolumns[colkey][(
                iid, aid, eid,
                rowidx if length is not None else -1)] = value
            if not self._requiredColumns or colkey in self._requiredColumns:
                count += 1
    return count


def _collectRecords(self, columns, recordlist, doctype, iid='', aid=''):
    """
    Collect statistics and possibly row values from a list of records.

    :param columns: the column dictionary to possibly modify.
    :param recordlist: a list of records to use.
    :param doctype: the base document type.
    :param iid: an optional item id to use for determining distinct rows.
    :param aid: an optional annotation id to use for determining distinct
        rows.
    :return: the number of required data entries added to the data
        collection process.  This will be zero when just listing columns.
        If no required fields were specified, this will be the count of
        all added data entries.
    """
    count = None
    eid = ''
    for colkey, col in columns.items():
        if self._datacolumns and colkey not in self._datacolumns:
            continue
        for where, (getData, selector, length) in col['where'].items():
            if self.cancel:
                return 0
            # Accept the doctype itself and any of its qualified children.
            if doctype != where and not where.startswith(doctype + '.'):
                continue
            for recidx, record in enumerate(recordlist):
                if doctype == 'item':
                    iid = str(record['_id'])
                elif doctype == 'annotation':
                    aid = str(record['_id'])
                elif doctype == 'annotationelement':
                    eid = str(record['id'])
                data = getData(record)
                try:
                    rows = 1 if length is None else length(record, data)
                except Exception:
                    continue
                subcount = self._collectRecordRows(
                    record, data, selector, length, colkey, col, recidx, rows,
                    iid, aid, eid, doctype, columns)
                if self._datacolumns:
                    if colkey in self._requiredColumns:
                        # Required columns limit the result to the
                        # smallest per-column count.
                        count = min(count, subcount) if count is not None else subcount
                    else:
                        count = (count or 0) + subcount
    return count if count is not None else 0


def _collectColumns(self, columns, recordlist, doctype, first=True, iid='', aid=''):
    """
    Collect the columns available for a set of records.

    :param columns: the column dictionary to possibly modify.
    :param recordlist: a list of records to use.
    :param doctype: the base document type.
    :param first: False if this is not the first page of a multi-page list
        of records.
    :param iid: an optional item id to use for determining distinct rows.
    :param aid: an optional annotation id to use for determining distinct
        rows.
    :return: the number of required data entries added to the data
        collection process.  This will be zero when just listing columns.
        If no required fields were specified, this will be the count of
        all added data entries.
    """
    getData = self.recordSelector(doctype.split('.', 1)[0])
    if doctype == 'item':
        self._commonColumn(columns, 'item.id', doctype, getData,
                           lambda record, data, row: str(record['_id']))
        self._commonColumn(columns, 'item.name', doctype, getData,
                           lambda record, data, row: record['name'])
        self._commonColumn(columns, 'item.description', doctype, getData,
                           lambda record, data, row: record['description'])
    if doctype == 'annotation':
        self._commonColumn(columns, 'annotation.id', doctype, getData,
                           lambda record, data, row: str(record['_id']))
        self._commonColumn(columns, 'annotation.name', doctype, getData,
                           lambda record, data, row: record['annotation']['name'])
        self._commonColumn(columns, 'annotation.description', doctype, getData,
                           lambda record, data, row: record['annotation']['description'])
    if doctype == 'annotationelement':
        self._commonColumn(columns, 'annotationelement.id', doctype, getData,
                           lambda record, data, row: str(record['id']))
        self._commonColumn(columns, 'annotationelement.group', doctype, getData,
                           lambda record, data, row: record['group'])
        self._commonColumn(columns, 'annotationelement.label', doctype, getData,
                           lambda record, data, row: record['label']['value'])
        self._commonColumn(columns, 'annotationelement.type', doctype, getData,
                           lambda record, data, row: record['type'])
        self._commonColumn(columns, 'annotation.id', doctype, getData,
                           lambda record, data, row: str(record['_aid']))
        self._commonColumn(columns, 'annotation.name', doctype, getData,
                           lambda record, data, row: str(record['_aname']))
        self._commonColumn(columns, 'bbox.x0', doctype, getData,
                           lambda record, data, row: record['_bbox']['lowx'])
        self._commonColumn(columns, 'bbox.y0', doctype, getData,
                           lambda record, data, row: record['_bbox']['lowy'])
        self._commonColumn(columns, 'bbox.x1', doctype, getData,
                           lambda record, data, row: record['_bbox']['highx'])
        self._commonColumn(columns, 'bbox.y1', doctype, getData,
                           lambda record, data, row: record['_bbox']['highy'])
    if first or self._fullScan or doctype != 'item':
        # Without a full scan only the first record is sampled for keys.
        for record in recordlist[:None if self._fullScan else 1]:
            self._columnsFromData(columns, doctype, getData, record)
    return self._collectRecords(columns, recordlist, doctype, iid, aid)


def _getColumnsFromAnnotations(self, columns):
    """
    Collect columns and data from annotations.

    :param columns: the column dictionary to possibly modify.
    :returns: the number of data entries added, or 0 if cancelled.
    """
    from ..models.annotationelement import Annotationelement

    count = 0
    countsPerAnnotation = {}
    for iidx, annotList in enumerate(self.annotations or []):
        iid = str(self.items[iidx]['_id'])
        for anidx, annot in enumerate(annotList):
            if self.cancel:
                return 0
            # This had been checking if the first item's annotation didn't
            # contribute any required data to the data set, skip subsequent
            # items' annotations; they are likely to be discarded.  This
            # is untrue if datafiles or folder level data augments the
            # element records
            # if iidx and not countsPerAnnotation.get(anidx, 0) and not self._fullScan:
            #     continue
            startcount = count
            if annot is None:
                continue
            if not self._sources or 'annotation' in self._sources:
                count += self._collectColumns(columns, [annot], 'annotation', iid=iid)
            # add annotation elements
            if ((not self._sources or 'annotationelement' in self._sources) and
                    Annotationelement().countElements(annot) <= self.maxAnnotationElements):
                for element in Annotationelement().yieldElements(annot, bbox=True):
                    # Carry the parent annotation's id and name on each
                    # element so the common annotation columns can read
                    # them.
                    element['_aid'] = annot['_id']
                    element['_aname'] = annot['annotation']['name']
                    count += self._collectColumns(
                        columns, [element], 'annotationelement', iid=iid,
                        aid=str(annot['_id']))
            if not iidx:
                countsPerAnnotation[anidx] = count - startcount
    return count


def _getColumnsFromDataFiles(self, columns):
    """
    Collect columns and data from data files in items.

    :param columns: the column dictionary to possibly modify.
    :returns: the number of data entries added, or 0 if cancelled or if
        there are no data files.
    :raises Exception: any error from reading a data file is logged and
        re-raised.
    """
    if not len(self._itemfilelist) or not len(self._itemfilelist[0]):
        return 0
    count = 0
    countsPerDataFile = {}
    for iidx, dfList in enumerate(self._itemfilelist or []):
        iid = str(self.items[iidx]['_id'])
        for dfidx, file in enumerate(dfList):
            if self.cancel:
                return 0
            # If the first item's data file didn't contribute any required
            # data to the data set, skip subsequent items' data files;
            # they are likely to be discarded.
            if iidx and not countsPerDataFile.get(dfidx, 0) and not self._fullScan:
                continue
            startcount = count
            if file is None:
                continue
            if not self._sources or 'datafile' in self._sources:
                try:
                    df = _dfFromFile(file['_id'], bool(self._datacolumns or self._fullScan))
                    count += self._collectColumns(
                        columns, [df] if isinstance(df, dict) else df,
                        f'datafile.{dfidx}', iid=iid)
                except Exception:
                    logger.info(
                        f'Cannot process file {file["_id"]}: {file["name"]} as a dataframe')
                    raise
            if not iidx:
                countsPerDataFile[dfidx] = count - startcount
    return count


def _computeFunction(self, rows):
    """
    Run the configured compute function on the collected rows.

    The result is stored in ``self._computed``.

    :param rows: a dictionary of row key to a list of input values.
    :returns: True if a function was run; None if the configured function
        is not recognized.
    """
    if self._compute['function'] == 'umap':
        import umap

        logger.info(f'Calling umap on {len(rows)} rows')
        reducer = umap.UMAP(**self._compute['params'])
        self._computed = reducer.fit_transform(list(rows.values()))
        logger.info('Called umap')
        return True


def _getColumnsFromCompute(self, columns):  # noqa
    """
    Collect columns and data from compute actions.

    When only listing columns, the compute columns are offered if at
    least two numeric columns exist.  When collecting data, the compute
    function is run on the rows that have a value in every needed column.

    :param columns: the column dictionary to possibly modify.
    :returns: the number of rows that received computed values, or 0.
    """
    def computeGetData(record):
        return {}

    def computeLength(record, data):
        return len(self._computed)

    def computeSelector(key):
        # The last character of the key (x, y, z) picks the output axis.
        axis = ord(key[-1:]) - ord('x')

        def computeSelectorAxis(record, data, row):
            return self._computed[row][axis]

        return computeSelectorAxis

    if not self._datacolumns:
        if len([col for col in columns.values() if col['type'] == 'number']) >= 2:
            for key in self.computeColumns:
                title = self.commonColumns[key]
                self._ensureColumn(
                    columns, key, title, 'compute', computeGetData,
                    computeSelector(key), computeLength)
                # Placeholder statistics so the column is listed.
                columns[key]['count'] = 1
                columns[key]['min'] = columns[key]['max'] = 0
        return 0
    if self._compute is None or not len(self._requiredColumns & self.computeColumns):
        return 0
    compcol = {
        key for key, col in columns.items()
        if col['type'] == 'number' and col.get('min') is not None
    } & set(self._compute['columns'])
    if not len(compcol):
        return 0
    rows = {}
    cols = sorted({col for col in self._compute['columns'] if col in self._datacolumns})
    lencols = len(cols)
    needcols = cols + sorted(set(self._requiredColumns) - set(cols) - self.computeColumns)
    for kidx, key in enumerate(needcols):
        for row, value in self._datacolumns[key].items():
            if not kidx:
                rows[row] = [value]
            elif row in rows and len(rows[row]) == kidx:
                rows[row].append(value)
    # Keep only rows that have a value for every needed column, then
    # trim each row to the compute input columns.
    rows = {k: row for k, row in rows.items() if len(row) == len(needcols)}
    if not len(rows):
        return 0
    rows = {k: row[:lencols] for k, row in rows.items()}
    if self.cancel:
        return 0
    if not self._computeFunction(rows):
        return 0
    for key in self.computeColumns:
        if key in self._requiredColumns and key in self._datacolumns:
            title = self.commonColumns[key]
            self._ensureColumn(
                columns, key, title, 'compute', computeGetData,
                computeSelector(key), computeLength)
            cidx = ord(key[-1:]) - ord('x')
            for ridx, row in enumerate(rows):
                self._datacolumns[key][row] = float(self._computed[ridx][cidx])
            columns[key]['count'] = len(rows)
            columns[key]['min'] = columns[key]['max'] = 0
    return len(rows)


def _getColumns(self):
    """
    Get a sorted list of plottable columns with some metadata for each.

    When ``self._datacolumns`` is set, the values of those columns are
    collected into it as a side effect.

    :returns: a sorted list of data entries.
    """
    count = 0
    columns = {}
    if not self._sources or 'folder' in self._sources:
        count += self._collectColumns(columns, [self.folder], 'folder')
    if not self._sources or 'item' in self._sources:
        count += self._collectColumns(columns, self.items, 'item')
        if self._moreItems:
            for item in Folder().childItems(
                    self.folder, offset=len(self.items), **self._moreItems):
                count += self._collectColumns(columns, [item], 'item', first=False)
    count += self._getColumnsFromAnnotations(columns)
    count += self._getColumnsFromDataFiles(columns)
    count += self._getColumnsFromCompute(columns)
    for result in columns.values():
        if len(result['distinct']) <= self.maxDistinct:
            result['distinct'] = sorted(result['distinct'])
            result['distinctcount'] = len(result['distinct'])
        else:
            result.pop('distinct', None)
        if result['type'] != 'number' or result['min'] is None:
            result.pop('min', None)
            result.pop('max', None)
    prefixOrder = {
        'item': 0, 'annotation': 1, 'annotationelement': 2, 'data': 3,
        'bbox': 4, 'compute': 5}
    # Order by key prefix, then columns with more than one value first,
    # then title, then key.
    columns = sorted(columns.values(), key=lambda x: (
        prefixOrder.get(x['key'].split('.', 1)[0], len(prefixOrder)),
        x['count'] <= 1, x['title'].lower(), x['key']))
    return columns


@property
def columns(self):
    """
    Get a sorted list of plottable columns with some metadata for each.

    The column list is computed on first access and cached in
    ``self._columns``.  Every access returns the same filtered view:
    columns with no values are dropped, and the internal ``where``
    selector records are removed.

    Each data entry contains

        :key: the column key.  For database entries, this is (item|
            annotation|annotationelement).(id|name|description|group|
            label).  For bounding boxes this is bbox.(x0|y0|x1|y1).  For
            data from meta / attributes / user, this is
            data.(key)[.0][.(key2)][.0]
        :type: 'string' or 'number'
        :title: a human readable title
        :count: the number of non-null entries in the column
        :[distinct]: a list of distinct values if there are fewer than
            some maximum number of distinct values.  This might not
            include values from adjacent items
        :[distinctcount]: if distinct is populated, this is len(distinct)
        :[min]: for number data types, the lowest value present
        :[max]: for number data types, the highest value present

    :returns: a sorted list of data entries.
    """
    if self._columns is None:
        self._columns = self._getColumns()
    # Build the public view from the cache on every access so repeated
    # reads never expose the raw cached entries.
    return [{k: v for k, v in c.items() if k != 'where'}
            for c in self._columns if c['count']]


def _collectData(self, rows, colsout):
    """
    Get data rows and columns.

    A row that lacks a value for a column falls back to progressively
    less specific row ids: the same element with no row index, then the
    annotation, then the item, then the global entry.  Row ids that were
    used as a fallback are removed from the output.

    :param rows: a list of row id tuples.
    :param colsout: a list of output columns.
    :returns: a data array and an updated row list.
    """
    data = [[None] * len(colsout) for _ in range(len(rows))]
    discard = set()
    for cidx, col in enumerate(colsout):
        colkey = col['key']
        if colkey in self._datacolumns:
            datacol = self._datacolumns[colkey]
            for ridx, rowid in enumerate(rows):
                value = datacol.get(rowid, None)
                if value is None and rowid[3] != -1:
                    value = datacol.get((rowid[0], rowid[1], rowid[2], -1), None)
                    if value is not None:
                        discard.add((rowid[0], rowid[1], rowid[2], -1))
                if value is None and (rowid[3] != -1 or rowid[2]):
                    value = datacol.get((rowid[0], rowid[1], '', -1), None)
                    if value is not None:
                        discard.add((rowid[0], rowid[1], '', -1))
                if value is None and (rowid[3] != -1 or rowid[2] or rowid[1]):
                    value = datacol.get((rowid[0], '', '', -1), None)
                    if value is not None:
                        discard.add((rowid[0], '', '', -1))
                if value is None and (rowid[3] != -1 or rowid[2] or rowid[1] or rowid[0]):
                    value = datacol.get(('', '', '', -1), None)
                    if value is not None:
                        discard.add(('', '', '', -1))
                data[ridx][cidx] = value
    if len(discard):
        data = [row for ridx, row in enumerate(data) if rows[ridx] not in discard]
        rows = [row for ridx, row in enumerate(rows) if rows[ridx] not in discard]
    return data, rows
def data(self, columns, requiredColumns=None):  # noqa
    """
    Get plottable data.

    :param columns: the columns to return.  Either a list of column names
        or a comma-delimited string.
    :param requiredColumns: only return data rows where all of these
        columns are non-None.  Either a list of column names or a
        comma-delimited string.
    :returns: a dictionary with 'columns', a list of column entries with
        refreshed statistics and an 'index' into each data row, and
        'data', a list of rows.  Returns None if the operation was
        cancelled.
    """
    if not isinstance(columns, list):
        columns = columns.split(',')
    if requiredColumns is None:
        requiredColumns = []
    elif not isinstance(requiredColumns, list):
        requiredColumns = requiredColumns.split(',')
    specifiedReqColumns = set(requiredColumns)
    self._requiredColumns = set(requiredColumns)
    compute = self._compute
    if compute:
        wantsThirdAxis = (
            'compute.z' in specifiedReqColumns and
            compute['function'] == 'umap' and
            'n_components' not in compute['params'])
        if wantsThirdAxis:
            compute['params']['n_components'] = 3
        # The inputs of the compute function become implied requirements.
        self._requiredColumns.update(compute['columns'])
    with self._dataLock:
        self._datacolumns = {name: {} for name in columns}
        # Listing the columns fills self._datacolumns as a side effect.
        collist = self._getColumns()
        if self.cancel:
            return
        rows = set()
        for coldata in self._datacolumns.values():
            rows.update(coldata)
        rows = sorted(rows)
        colsout = []
        for entry in collist:
            if entry['key'] in columns:
                outcol = entry.copy()
                outcol['index'] = len(colsout)
                colsout.append(outcol)
        logger.info(f'Gathering {len(colsout)} x {len(rows)} data')
        data, rows = self._collectData(rows, colsout)
        self._datacolumns = None
    if hasattr(self, '_bboxLookup'):
        logger.info(f'Bounding boxes: {sum(len(x) for x in self._bboxLookup.values())}')
    # Rows missing an explicitly required column are always dropped.
    for cidx, col in enumerate(colsout):
        colkey = col['key']
        if colkey not in specifiedReqColumns:
            continue
        numrows = len(data)
        data = [row for row in data if row[cidx] is not None]
        if len(data) < numrows:
            logger.info(f'Reduced row count from {numrows} to {len(data)} '
                        f'because of None values in column {colkey}')
    # Rows missing an implied column are dropped only if some rows remain.
    subdata = data
    for cidx, col in enumerate(colsout):
        colkey = col['key']
        if colkey in specifiedReqColumns or colkey not in self._requiredColumns:
            continue
        numrows = len(subdata)
        subdata = [row for row in subdata if row[cidx] is not None]
        if len(subdata) < numrows:
            logger.info(f'Reduced row count from {numrows} to {len(subdata)} '
                        f'because of None values in implied column {colkey}')
    if len(subdata) and len(subdata) < len(data):
        data = subdata
    if self.cancel:
        return
    # Refresh our count, distinct, distinctcount, min, max for each column
    for cidx, col in enumerate(colsout):
        present = [row[cidx] for row in data if row[cidx] is not None]
        col['count'] = len(present)
        if col['type'] == 'number' and present:
            col['min'] = min(present)
            col['max'] = max(present)
        if col['type'] == 'string':
            distinct = {str(value) for value in present}
        else:
            distinct = set(present)
        if len(distinct) <= self.maxDistinct:
            col['distinct'] = sorted(distinct)
            col['distinctcount'] = len(distinct)
        else:
            col.pop('distinct', None)
            col.pop('distinctcount', None)
    colsout = [{k: v for k, v in c.items() if k != 'where'} for c in colsout]
    return {
        'columns': colsout,
        'data': data}