# Source code for spear.labeling.prelabels.core

import numpy as np
import enum, json
from typing import Optional

from ..lf_set import LFSet
from ..apply import LFApplier
from ..analysis import LFAnalysis
from ..lf import ABSTAIN 
from ..utils import dump_to_pickle
from ..data_types import DataPoints


class PreLabels:
    """Generate noisy labels and continuous scores from LFs applied on data.

    The labelling functions are applied lazily: the first call to
    :meth:`get_labels`, :meth:`analyse_lfs` or :meth:`generate_pickle` runs
    them over ``data`` and the result is cached on the instance.

    Args:
        name (str): Name for this object; used to build default file names.
        data (DataPoints): Datapoints the rules are applied to.
        rules (LFSet): Set of rules used to generate noisy labels for the dataset.
        num_classes (int): Number of classes; must equal ``len(labels_enum)``.
        labels_enum: Enum of the classes. Its member values must be distinct
            and no member may be named ``ABSTAIN``.
        data_feats (Optional[DataPoints]): Features of the datapoints. Either
            empty or one row per datapoint.
        gold_labels (Optional[DataPoints]): Labels for datapoints if available.
            Either empty or one label per datapoint, each a value of
            ``labels_enum``.
        exemplars (DataPoints): Exemplar matrix. Either empty or of shape
            ``(number of datapoints, number of rules)``.

    Raises:
        AssertionError: If any of the consistency checks above fails.
    """

    # Project types are annotated as forward references so that the
    # annotations are not evaluated when the class body is executed.
    def __init__(
        self,
        name: str,
        data: "DataPoints",
        rules: "LFSet",
        num_classes: int,
        labels_enum,
        data_feats: "Optional[DataPoints]" = np.array([]),
        gold_labels: "Optional[DataPoints]" = np.array([]),
        exemplars: "DataPoints" = np.array([]),
    ) -> None:
        """Instantiates PreLabels class with dataset and set of LFs to noisily label the dataset
        """
        self.name = name
        self._data = data
        self._rules = rules
        self._num_classes = num_classes
        self._labels_enum = labels_enum
        self._data_feats = data_feats
        self._gold_labels = gold_labels
        self._R = exemplars
        # Filled in together by _ensure_labels() on first use.
        self._L = None
        self._S = None

        assert num_classes == len(labels_enum), \
            "num_classes must equal the number of members in labels_enum"

        lab_vals = set(item.value for item in self._labels_enum)
        assert len(lab_vals) == self._num_classes, \
            "labels_enum must have num_classes distinct values"

        lab_nams = set(item.name for item in self._labels_enum)
        assert 'ABSTAIN' not in lab_nams, \
            "labels_enum must not define a member named ABSTAIN"

        num_inst = self._data.shape[0]
        assert (self._data_feats.shape[0] == 0) or (self._data_feats.shape[0] == num_inst), \
            "data_feats must be empty or have one row per datapoint"
        assert (len(self._gold_labels) == num_inst) or (self._gold_labels.shape[0] == 0), \
            "gold_labels must be empty or have one label per datapoint"
        unique_labs = set(np.unique(self._gold_labels))
        assert unique_labs.issubset(lab_vals), \
            "gold_labels may only contain values of labels_enum"
        assert (self._R.shape[0] == 0) or (
            self._R.shape[0] == num_inst and self._R.shape[1] == len(self._rules)
        ), "exemplars must be empty or of shape (num datapoints, num rules)"

    def _ensure_labels(self):
        """Apply the rules to the data once and cache labels and scores."""
        # Both caches are checked: they are always written together, so a
        # missing one means the pair has to be recomputed.
        if self._L is None or self._S is None:
            applier = LFApplier(lf_set=self._rules)
            L, S = applier.apply(self._data)
            self._L = L
            self._S = S

    def get_labels(self):
        """Applies LFs to the dataset to generate noisy labels and returns noisy labels and confidence scores

        Returns:
            Tuple(DataPoints, DataPoints): Noisy Labels and Confidences
        """
        self._ensure_labels()
        return self._L, self._S

    def analyse_lfs(self, plot=False):
        """Analyse the lfs in LFSet on data

        Gold labels are passed to the analysis only when they were supplied.

        Args:
            plot (bool, optional): Plot the values. Defaults to False.

        Returns:
            DataFrame: dataframe consisting of Polarity, Coverage, Overlap, Conflicts, Empirical Acc
        """
        self._ensure_labels()

        analysis = LFAnalysis(self._labels_enum, self._L, self._rules)
        if len(self._gold_labels) == 0:
            return analysis.lf_summary(plot=plot)
        return analysis.lf_summary(self._gold_labels, plot=plot)

    def generate_json(self, filename=None):
        """Generates a json file with label value to label name mapping

        Args:
            filename (str, optional): Name for json file. Defaults to None,
                in which case ``<name>_json.json`` is used.
        """
        if filename is None:
            filename = self.name + "_json.json"

        value_to_name = {e.value: e.name for e in self._labels_enum}

        with open(filename, "w") as outfile:
            json.dump(value_to_name, outfile)

    def generate_pickle(self, filename=None):
        """Generates a pickle file with noisy labels, confidence and other Metadata

        The dumped list holds, in this order: data features, noisy labels,
        coverage mask, gold labels (as a column), labeled-data indicator,
        exemplars, continuous scores, per-LF continuous flag, per-LF class
        value and the number of classes.

        Args:
            filename (str, optional): Name for pickle file. Defaults to None,
                in which case ``<name>_pickle.pkl`` is used.
        """
        if filename is None:
            filename = self.name + "_pickle.pkl"

        self._ensure_labels()

        num_inst = self._data.shape[0]
        # Materialise once so both per-LF arrays are built from the same sequence.
        lfs = list(self._rules.get_lfs())

        feats = self._data_feats
        noisy_labels = self._L
        covered = (self._L != ABSTAIN).astype(int)              # lf covers example or not
        gold = self._gold_labels                                # true labels
        gold = gold.reshape(gold.size, 1)                       # stored as a single column
        labeled_flag = np.ones((num_inst, 1))                   # belongs to labeled data or not
        exemplars = self._R                                     # exemplars
        scores = self._S                                        # continuous scores
        is_cont = np.array([lf._is_cont for lf in lfs])         # lf continuous or not
        lf_class = np.array([lf._label.value for lf in lfs])    # lf associated to which class

        output = [
            feats,
            noisy_labels,
            covered,
            gold,
            labeled_flag,
            exemplars,
            scores,
            is_cont,
            lf_class,
            self._num_classes,
        ]
        dump_to_pickle(filename, output)