import pickle, os, json
import numpy as np
num_args = 9
def get_data(path):
    '''
    Func Desc:
        Reads the pickle file at `path` and returns its contents as a list,
        to be used to set member variables accordingly.
        The pickle file is expected to contain num_args (9) NUMPY arrays,
        dumped sequentially in this order:
            x: [num_instances, num_features]
            l: [num_instances, num_rules]
            m: [num_instances, num_rules]
            L: [num_instances, 1]
            d: [num_instances, 1]
            r: [num_instances, num_rules]
            s: [num_instances, num_rules]
            n: [num_rules] Mask for s
            k: [num_rules] LF classes, range 0 to num_classes-1
    Input:
        path - path to the pickle file
    Output:
        data - list of the num_args loaded arrays, in dump order
    '''
    data = []
    with open(path, 'rb') as file:
        # The arrays were dumped with sequential pickle.dump calls, so each
        # one needs its own pickle.load. The original code loaded only the
        # first object, which made the length assertion below always fail.
        for _ in range(num_args):
            data.append(pickle.load(file))
    assert len(data) == num_args
    return data
def analyze_w_predictions(x, l, m, L, d, weights, probs, rule_classes):
    '''
    Func Desc:
        Analyzes the rule network: thresholds the learned rule weights into a
        denoised coverage matrix, recomputes rule labels and precisions, and
        prints old-vs-new statistics (micro precision, firings, coverage).
    Input:
        x: [num_instances, num_features]
        l: [num_instances, num_rules]
        m: [num_instances, num_rules]
        L: [num_instances, 1]
        d: [num_instances, 1]
        weights: [num_instances, num_rules]
        probs: [num_instances, num_classes]
        rule_classes: [num_rules,1]
    Output:
        void, prints the required statistics
    '''
    num_classes = probs.shape[1]
    # Keep an instance covered only where the rule network agrees (weight > 0.5).
    denoised_m = convert_weights_to_m(weights) * m
    denoised_l = convert_m_to_l(denoised_m, rule_classes, num_classes)
    old_micro, old_macro, old_per_rule = get_rule_precision(l, L, m)
    new_micro, new_macro, new_per_rule = get_rule_precision(denoised_l, L, denoised_m)
    print("old micro precision: ", old_micro)
    print("new micro precision: ", new_micro)
    print("old rule firings: ", np.sum(m))
    print("new rule firings: ", np.sum(denoised_m))
    # Coverage = number of instances fired on by at least one rule.
    print("old rule coverage: ", sum(1 for row in m if sum(row) > 0))
    print("new rule coverage: ", sum(1 for row in denoised_m if sum(row) > 0))
def convert_weights_to_m(weights):
    '''
    Func Desc:
        Thresholds the rule-network weights into a binary coverage matrix.
    Input:
        weights([batch_size, num_rules]) - the weights matrix corresponding to the rule network (w_network) in the algorithm
    Output:
        m([batch_size, num_rules]) - the rule coverage matrix where m_ij = 1 if jth rule covers ith instance (weight > 0.5)
    '''
    return (weights > 0.5).astype(np.int32)
def convert_m_to_l(m, rule_classes, num_classes):
    '''
    Func Desc:
        Converts the coverage matrix m into the rule-label matrix l.
    Input:
        m([batch_size, num_rules]) - the rule coverage matrix where m_ij = 1 if jth rule covers ith instance
        rule_classes - per-rule class labels, broadcast over the batch
        num_classes(non_negative integer) - number of available classes; used as the
            "no label" sentinel wherever a rule does not cover an instance
    Output:
        l([batch_size, num_rules]) - labels assigned by the rules
    '''
    # Repeat the per-rule class row once per instance in the batch.
    per_instance_classes = np.tile(rule_classes, (m.shape[0], 1))
    # Covered entries take the rule's class; uncovered entries take num_classes.
    return m * per_instance_classes + (1 - m) * num_classes
def get_rule_precision(l, L, m):
    '''
    Func Desc:
        Computes the precision of the rules against the true labels.
    Input:
        l([batch_size, num_rules]) - labels assigned by the rules
        L([batch_size, 1]) - true label of the ith instance
        m([batch_size, num_rules]) - the rule coverage matrix where m_ij = 1 if jth rule covers ith instance
    Output:
        micro_p - micro precision: total correct firings / total firings
        macro_p - macro precision: mean per-rule precision over rules that fired at least once
        per-rule precision array ([num_rules]) - correct firings / firings per rule
    '''
    L = L.reshape([L.shape[0], 1])
    # np.float was removed in NumPy 1.24; the builtin float (float64) is equivalent.
    comp = np.equal(l, L).astype(float)
    comp = comp * m          # count a match only where the rule actually fired
    comp = np.sum(comp, 0)   # per-rule number of correct firings
    support = np.sum(m, 0)   # per-rule number of firings
    micro_p = np.sum(comp) / np.sum(support)
    # 1e-25 guards against divide-by-zero for rules that never fired.
    per_rule_p = comp / (support + 1e-25)
    # Macro precision averages only over rules with at least one firing.
    macro_p = np.mean(per_rule_p[support > 0])
    return micro_p, macro_p, per_rule_p
# from utils
def merge_dict_a_into_b(a, b):
    '''
    Func Desc:
        Copies every entry of dict a into dict b, in place.
        Asserts that no key of a is already present in b (no overwrites).
    Input:
        a, b : dicts
    Output:
        void (b is mutated)
    '''
    for key, value in a.items():
        assert key not in b
        b[key] = value
def print_tf_global_variables():
    '''
    Func Desc:
        Prints all TensorFlow global variables as an indented JSON list of
        their string representations.
    Input:
        none
    Output:
        void, prints to stdout
    '''
    # import tensorflow as tf
    # Imported lazily so the rest of this module works without TensorFlow;
    # the v1 compat API is used because tf.global_variables is TF1-only.
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()
    print(json.dumps([str(foo) for foo in tf.global_variables()], indent=4))
def print_var_list(var_list):
    '''
    Func Desc:
        Prints the given variable list as an indented JSON array of the
        variables' string representations.
    Input:
        var_list - iterable of variables
    Output:
        void, prints to stdout
    '''
    as_strings = [str(var) for var in var_list]
    print(json.dumps(as_strings, indent=4))
def pretty_print(data_structure):
    '''
    Func Desc:
        Prints the given data structure as indented JSON.
    Input:
        data_structure - any JSON-serializable object
    Output:
        void, prints to stdout
    '''
    serialized = json.dumps(data_structure, indent=4)
    print(serialized)
def get_list_or_None(s, dtype=int):
    '''
    Func Desc:
        Parses a comma-separated string into a list of dtype values.
    Input:
        s - string (e.g. "1,2,3")
        dtype - conversion function applied to each token (default - int)
    Output:
        None if s is blank, otherwise the list of converted values
    '''
    stripped = s.strip()
    if not stripped:
        return None
    return [dtype(token) for token in stripped.split(',')]
def get_list(s):
    '''
    Func Desc:
        Same as get_list_or_None, but returns an empty list instead of None
        for blank input.
    Input:
        s - comma-separated string
    Output:
        lst - list (possibly empty)
    '''
    parsed = get_list_or_None(s)
    return [] if parsed is None else parsed
def None_if_zero(n):
    '''
    Func Desc:
        Maps non-positive values to None.
    Input:
        n - integer
    Output:
        n if n > 0, else None
    '''
    return n if n > 0 else None
def boolean(s):
    '''
    Func Desc:
        Parses the exact strings 'True' / 'False' into booleans.
    Input:
        s - string
    Output:
        boolean
    Raises:
        ValueError for any other string
    '''
    lookup = {'True': True, 'False': False}
    if s in lookup:
        return lookup[s]
    raise ValueError('Invalid boolean value: %s' % s)
def set_to_list_of_values_if_None_or_empty(lst, val, num_vals):
    '''
    Func Desc:
        Returns lst if it is non-empty (after validating its length),
        otherwise returns a new list of num_vals copies of val.
    Input:
        lst - list (or None/empty)
        val - fill value used when lst is empty
        num_vals (integer) - required length of the list
    Output:
        lst, or [val] * num_vals
    '''
    if not lst:
        return [val] * num_vals
    # Removed a stray debug print of (len(lst), num_vals) that polluted stdout.
    assert len(lst) == num_vals, "expected %d values, got %d" % (num_vals, len(lst))
    return lst
# from snorkel_utils
def conv_l_to_lsnork(l, m):
    '''
    func desc:
        in snorkel convention
        if a rule does not cover an instance assign it label -1
        we follow the convention where we assign the label num_classes instead of -1
        valid class labels range from {0,1,...num_classes-1}
        conv_l_to_lsnork: converts l in our format to snorkel's format
    input:
        l([batch_size, num_rules]) - rule label matrix
        m([batch_size, num_rules]) - rule coverage matrix
    output:
        lsnork([batch_size, num_rules]) - integer matrix with -1 where m == 0
    '''
    lsnork = l * m + -1 * (1 - m)
    # np.int was removed in NumPy 1.24; the builtin int is the equivalent alias.
    return lsnork.astype(int)
# from metric_utils
def compute_accuracy(support, recall):
    '''
    func desc:
        computes the support-weighted average of recall (micro accuracy)
    input:
        support - per-class support counts
        recall - per-class recall values
    output:
        accuracy - sum(support * recall) / sum(support)
    '''
    weighted_hits = np.multiply(support, recall)
    return np.sum(weighted_hits) / np.sum(support)
# from data_utils
def dump_labels_to_file(save_filename, x, l, m, L, d, weights=None, f_d_U_probs=None, rule_classes=None):
    '''
    Func Desc:
        Dumps the given data sequentially into a pickle file. The optional
        arguments are only written when provided, in the order
        weights, f_d_U_probs, rule_classes.
    Input:
        save_filename - the name of the pickle file in which the arguments/data is required to be saved
        x ([batch_size x num_features])
        l ([batch_size x num_rules])
        m ([batch_size x num_rules])
        L ([batch_size x 1])
        d ([batch_size x 1])
        weights (default - None)
        f_d_U_probs (default - None)
        rule_classes (default - None)
    Output:
        void, writes the pickle file
    '''
    # `with` guarantees the file is closed even if a dump raises, unlike the
    # previous open()/close() pair which leaked the handle on error.
    with open(save_filename, 'wb') as save_file:
        pickle.dump(x, save_file)
        pickle.dump(l, save_file)
        pickle.dump(m, save_file)
        pickle.dump(L, save_file)
        pickle.dump(d, save_file)
        if weights is not None:
            pickle.dump(weights, save_file)
        if f_d_U_probs is not None:
            pickle.dump(f_d_U_probs, save_file)
        if rule_classes is not None:
            pickle.dump(rule_classes, save_file)
def load_from_pickle_with_per_class_sampling_factor(fname, per_class_sampling_factor):
    '''
    Func Desc:
        Loads (x, l, m, L, d) from the given pickle file, replicating each
        instance per_class_sampling_factor[label] times (indexed by its L value).
    Input:
        fname - name of the pickle file from which data need to be loaded
        per_class_sampling_factor - per-label replication count, indexed by label
    Output:
        the required matrices
        x1 ([batch_size x num_features])
        l1 ([batch_size x num_rules])
        m1 ([batch_size x num_rules])
        L1 ([batch_size x 1])
        d1 ([batch_size x 1])
    '''
    with open(fname, 'rb') as f:
        x = pickle.load(f)
        l = pickle.load(f)
        m = pickle.load(f)
        L = pickle.load(f)
        d = np.squeeze(pickle.load(f))
    buckets = ([], [], [], [], [])
    for instance in zip(x, l, m, L, d):
        label = instance[3]
        # Replicate the whole instance according to its label's factor.
        for _ in range(per_class_sampling_factor[label]):
            for bucket, field in zip(buckets, instance):
                bucket.append(field)
    return tuple(np.array(bucket) for bucket in buckets)
def combine_d_covered_U_pickles(d_name, infer_U_name, out_name, d_sampling_factor, U_sampling_factor):
    '''
    Func Desc:
        Combines the labelled and unlabelled data, merges the corresponding
        matrices together, and stores them sequentially in a new pickle file.
    Input:
        d_name - the pickle file storing labelled data
        infer_U_name - the pickle file storing unlabelled data
        out_name - the name of the file where merged output needs to be stored
        d_sampling_factor - the per_class_sampling_factor for labelled data
        U_sampling_factor - the per_class_sampling_factor for unlabelled data
    Output:
        void, writes the merged pickle file
    '''
    labelled = load_from_pickle_with_per_class_sampling_factor(d_name, d_sampling_factor)
    unlabelled = load_from_pickle_with_per_class_sampling_factor(infer_U_name, U_sampling_factor)
    # Concatenate each of x, l, m, L, d pairwise, preserving dump order.
    merged = [np.concatenate(pair) for pair in zip(labelled, unlabelled)]
    with open(out_name, 'wb') as out_file:
        for matrix in merged:
            pickle.dump(matrix, out_file)
# from learn2reweight_utils
def updated_theta_copy(grads, variables, lr, mode):
    '''
    Func Desc:
        Returns a copy of the parameters moved along the gradients with the
        given learning rate.
    Input:
        grads - gradients
        variables - parameters to update
        lr - learning rate
        mode - +1 for v + lr*g (ascent), -1 for v - lr*g (descent)
    Output:
        vals - list of the updated parameter values
    Raises:
        ValueError - if mode is neither +1 nor -1 (the original code called
        print(exit(1)) here, exiting the interpreter via SystemExit)
    '''
    if mode not in (1, -1):
        raise ValueError("invalid mode error!")
    # mode is +/-1, so it can serve directly as the sign of the step.
    return [v + mode * lr * g for g, v in zip(grads, variables)]