Source code for spear.Implyloss.data_feeder_utils

import pickle
import numpy as np
import json
#from .config import flags as config

# from .my_data_types import *
from .data_types import *

#reduce_x_features = config.w_network == 'textcnn'
reduce_x_features = False
seq_len = 25

def change_values(l, user_class_to_num_map):
    '''
    Func Desc:
    Replace the class labels in l by sequential labels - 0, 1, 2, ...

    Input:
    l - the class label matrix
    user_class_to_num_map - dictionary storing the mapping from original class labels to sequential labels

    Output:
    l - with sequential labels
    '''
    d0, d1 = l.shape
    for i in range(d0):
        for j in range(d1):
            l[i][j] = user_class_to_num_map[l[i][j]]
    return l
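
# Illustrative sketch (not part of the original module): change_values applied
# to a hypothetical 2-D label matrix and mapping, shown doctest-style.
# >>> l = np.array([[1, 5], [3, 1]])
# >>> change_values(l, {1: 0, 3: 1, 5: 2})
# array([[0, 2],
#        [1, 0]])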

def load_data(fname, jname, num_load=None):
    '''
    Func Desc:
    Load the data from the given pickle file and the class-enumeration json file.

    Input:
    fname - pickle filename
    jname - json filename holding the user-class enumeration map
    num_load (default - None) - if given, load at most this many instances

    Output:
    the structured F_d_U_Data
    '''
    print('Loading from ', fname)
    with open(fname, 'rb') as f:
        x = pickle.load(f)
        l = pickle.load(f)  # .astype(np.int32)
        m = pickle.load(f).astype(np.int32)
        L = pickle.load(f).astype(np.int32)
        d = pickle.load(f).astype(np.int32)
        r = pickle.load(f).astype(np.int32)
        a1 = pickle.load(f)  # auxiliary objects: read to advance the stream, unused
        a2 = pickle.load(f)
        a3 = pickle.load(f)
        num_classes_pickle = pickle.load(f)  # .astype(np.int32)

    print("batch size", x.shape[0])
    print("num features", x.shape[1])
    print("num classes", num_classes_pickle)
    print("num rules", m.shape[1])

    with open(jname, 'rb') as j:
        enum_map_pickle = json.load(j)  # e.g. {1 -> Red, 3 -> Green, 5 -> Blue}

    # Build the map from user class labels to sequential labels 0, 1, 2, ...;
    # the sentinel None (no label) maps to num_classes_pickle.
    user_class_to_num_map = dict()
    val = 0
    for user_class in enum_map_pickle:
        print(user_class, " -> ", val)
        user_class_to_num_map[int(user_class)] = val
        val = val + 1
    print("None", " -> ", num_classes_pickle)
    user_class_to_num_map[None] = num_classes_pickle
    print("----------------------------")
    print(user_class_to_num_map)
    print("----------------------------")

    len_x = len(x)
    print("len_x", len_x)

    if r.shape[0] == 0:
        r = np.zeros(m.shape)
    print("len_r", len(r))

    # Remap the rule-label matrix l; if it is empty, create one filled with
    # None (shaped like m, so that change_values can process it).
    if l.shape[0] == 0:
        print("l is empty")
        l = np.full((len_x, m.shape[1]), None)
    l = change_values(l, user_class_to_num_map)

    # Remap the true-label column L; if it is empty, create one filled with None.
    if L.shape[0] == 0:
        print("L is empty")
        L = np.full((len_x, 1), None)
    L = change_values(L, user_class_to_num_map)

    assert len(l) == len_x
    assert len(m) == len_x
    assert len(L) == len_x
    assert len(d) == len_x
    assert len(r) == len_x

    d = np.reshape(d, (d.shape[0], 1))

    if reduce_x_features:
        x = np.concatenate([x[:, 0:seq_len],
                            x[:, 75:(seq_len + 75)],
                            x[:, 150:(150 + seq_len)]], axis=-1)

    if num_load is not None and num_load < len_x:
        x = x[:num_load]
        l = l[:num_load]
        m = m[:num_load]
        L = L[:num_load]
        d = d[:num_load]
        r = r[:num_load]

    return F_d_U_Data(x, l, m, L, d, r)
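
# Illustrative sketch (not part of the original module): a typical call, with
# hypothetical file names. The pickle must contain the fields in the order
# read above.
# >>> raw = load_data('d_processed.p', 'enum_map.json', num_load=1000)
# >>> raw.x.shape[0] <= 1000
# True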

def get_rule_classes(l, num_classes):
    '''
    Func Desc:
    Get the class labelled by each rule.

    Input:
    l ([batch_size, num_rules]) - the rule-label matrix
    num_classes (int) - the number of available classes; the value num_classes
    itself marks "rule did not fire"

    Output:
    rule_classes ([num_rules]) - the class labelled by each rule
    (say class 2 by r0, class 1 by r1, class 4 by r2 => [2, 1, 4])
    '''
    num_rules = l.shape[1]
    rule_classes = []
    for rule in range(num_rules):
        labels = l[:, rule]
        rule_class = num_classes  # sentinel: no valid label seen yet
        for lbl in labels:
            if lbl != num_classes:
                assert lbl < num_classes
                if rule_class != num_classes:
                    # every firing of a rule must assign the same class
                    assert lbl == rule_class
                else:
                    rule_class = lbl
        if rule_class == num_classes:
            # ok if a rule never fires; it keeps the sentinel class
            print('No valid label found for rule: ', rule)
        rule_classes.append(rule_class)
    return rule_classes
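
# Illustrative sketch (not part of the original module): with num_classes = 3,
# the value 3 marks instances on which a rule did not fire.
# >>> l = np.array([[2, 3, 1],
# ...               [2, 0, 3]])
# >>> [int(c) for c in get_rule_classes(l, 3)]
# [2, 0, 1]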

def extract_rules_satisfying_min_coverage(m, min_coverage):
    '''
    Func Desc:
    Extract the rules that satisfy the specified minimum coverage.

    Input:
    m ([batch_size, num_rules]) - m[i][j] specifies whether the ith example is
    associated with the jth rule
    min_coverage - the minimum number of examples a rule must cover

    Output:
    satisfying_rules - list of rules satisfying the threshold
    not_satisfying_rules - list of rules not satisfying it
    rule_map_new_to_old - new rule index -> old rule index
    rule_map_old_to_new - old rule index -> new rule index
    '''
    num_rules = len(m[0])
    coverage = np.sum(m, axis=0)
    satisfying_threshold = coverage >= min_coverage
    not_satisfying_threshold = np.logical_not(satisfying_threshold)
    all_rules = np.arange(num_rules)
    satisfying_rules = np.extract(satisfying_threshold, all_rules)
    not_satisfying_rules = np.extract(not_satisfying_threshold, all_rules)
    # Assert that the extraction is stable
    assert np.all(np.sort(satisfying_rules) == satisfying_rules)
    assert np.all(np.sort(not_satisfying_rules) == not_satisfying_rules)
    rule_map_new_to_old = np.concatenate([satisfying_rules, not_satisfying_rules])
    rule_map_old_to_new = np.zeros(num_rules, dtype=all_rules.dtype) - 1
    for new, old in enumerate(rule_map_new_to_old):
        rule_map_old_to_new[old] = new
    return satisfying_rules, not_satisfying_rules, rule_map_new_to_old, rule_map_old_to_new
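
# Illustrative sketch (not part of the original module): three rules covering
# 1, 3 and 1 examples; only rule 1 meets min_coverage = 2, so it moves to the
# front of the new ordering.
# >>> m = np.array([[0, 1, 1],
# ...               [0, 1, 0],
# ...               [1, 1, 0]])
# >>> s, ns, new_to_old, old_to_new = extract_rules_satisfying_min_coverage(m, 2)
# >>> s.tolist(), ns.tolist(), new_to_old.tolist(), old_to_new.tolist()
# ([1], [0, 2], [1, 0, 2], [1, 0, 2])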

def remap_2d_array(arr, map_old_to_new):
    '''
    Func Desc:
    Remap the columns of a 2D array in place: column i is overwritten with
    column map_old_to_new[i].

    Input:
    arr ([batch_size, num_rules])
    map_old_to_new

    Output:
    the modified array
    '''
    old = np.arange(len(map_old_to_new))
    arr[:, old] = arr[:, map_old_to_new]
    return arr
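
# Illustrative sketch (not part of the original module): the right-hand side
# is gathered (copied) before assignment, so the in-place write is safe.
# >>> remap_2d_array(np.array([[10, 20, 30]]), np.array([2, 0, 1]))
# array([[30, 10, 20]])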

def remap_1d_array(arr, map_old_to_new):
    '''
    Func Desc:
    Remap the positions of a 1D array in place: position i is overwritten with
    the value at position map_old_to_new[i].

    Input:
    arr ([num_rules])
    map_old_to_new

    Output:
    the modified array
    '''
    old = np.arange(len(map_old_to_new))
    arr[old] = arr[map_old_to_new]
    return arr
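
# Illustrative sketch (not part of the original module): the same gather, 1-D.
# >>> remap_1d_array(np.array([10, 20, 30]), np.array([2, 0, 1]))
# array([30, 10, 20])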

def modify_d_or_U_using_rule_map(raw_U_or_d, rule_map_old_to_new):
    '''
    Func Desc:
    Modify d or U in place using the rule map.

    Input:
    raw_U_or_d - the raw data (labelled (d) or unlabelled (U))
    rule_map_old_to_new - the rule map

    Output:
    None - raw_U_or_d.l and raw_U_or_d.m are remapped in place
    '''
    remap_2d_array(raw_U_or_d.l, rule_map_old_to_new)
    remap_2d_array(raw_U_or_d.m, rule_map_old_to_new)
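
# Illustrative sketch (not part of the original module): _Raw is a hypothetical
# stand-in for a raw-data object with l and m fields; both are remapped in place.
# >>> class _Raw: pass
# >>> raw = _Raw()
# >>> raw.l = np.array([[10, 20, 30]]); raw.m = np.array([[1, 0, 1]])
# >>> modify_d_or_U_using_rule_map(raw, np.array([2, 0, 1]))
# >>> raw.l.tolist(), raw.m.tolist()
# ([[30, 10, 20]], [[1, 1, 0]])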

def shuffle_F_d_U_Data(data):
    '''
    Func Desc:
    Shuffle the input data along the 0th axis, i.e. across instances.

    Input:
    data

    Output:
    the structured and shuffled F_d_U_Data
    '''
    idx = np.arange(len(data.x))
    np.random.shuffle(idx)
    x = np.take(data.x, idx, axis=0)
    l = np.take(data.l, idx, axis=0)
    m = np.take(data.m, idx, axis=0)
    L = np.take(data.L, idx, axis=0)
    d = np.take(data.d, idx, axis=0)
    r = np.take(data.r, idx, axis=0)
    return F_d_U_Data(x, l, m, L, d, r)

def oversample_f_d(x, labels, sampling_dist):
    '''
    Func Desc:
    Oversample the labelled data: each instance with label L is replicated
    sampling_dist[L] times.

    Input:
    x ([batch_size, num_features]) - the data
    labels - the labels
    sampling_dist - the per-class replication counts

    Output:
    the oversampled x and labels arrays
    '''
    x_list = []
    L_list = []
    for xx, L in zip(x, labels):
        for i in range(sampling_dist[L]):
            x_list.append(np.array(xx))
            L_list.append(np.array(L))
    return np.array(x_list), np.array(L_list)
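
# Illustrative sketch (not part of the original module): with
# sampling_dist = [1, 2], every class-1 instance is duplicated.
# >>> ox, ol = oversample_f_d(np.array([[1.0], [2.0]]), np.array([0, 1]), [1, 2])
# >>> ox.tolist(), ol.tolist()
# ([[1.0], [2.0], [2.0]], [0, 1, 1])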

def oversample_d(raw_d, sampling_dist):
    '''
    Func Desc:
    Perform oversampling on the raw labelled data: each instance with true
    label L is replicated sampling_dist[L] times across all six fields.

    Input:
    raw_d - raw labelled data
    sampling_dist - the given sampling distribution

    Output:
    the oversampled F_d_U_Data
    '''
    x_list = []
    l_list = []
    m_list = []
    L_list = []
    d_list = []
    r_list = []
    for x, l, m, L, d, r in zip(raw_d.x, raw_d.l, raw_d.m, raw_d.L, raw_d.d, raw_d.r):
        L1 = np.squeeze(L)
        for i in range(sampling_dist[L1]):
            x_list.append(np.array(x))
            l_list.append(np.array(l))
            m_list.append(np.array(m))
            L_list.append(np.array(L))
            d_list.append(np.array(d))
            r_list.append(np.array(r))
    return F_d_U_Data(np.array(x_list), np.array(l_list), np.array(m_list),
                      np.array(L_list), np.array(d_list), np.array(r_list))
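
# Illustrative sketch (not part of the original module): the field values below
# are hypothetical; raw_d.L holds the true labels used to index sampling_dist.
# >>> raw_d = F_d_U_Data(np.array([[1.0], [2.0]]),  # x
# ...                    np.array([[0], [1]]),      # l
# ...                    np.array([[1], [1]]),      # m
# ...                    np.array([[0], [1]]),      # L
# ...                    np.array([[1], [1]]),      # d
# ...                    np.array([[0], [0]]))      # r
# >>> oversample_d(raw_d, [1, 2]).x.tolist()
# [[1.0], [2.0], [2.0]]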