diff --git a/Code/10ops.txt b/Code/10ops.txt new file mode 100644 index 0000000..b38f7ee --- /dev/null +++ b/Code/10ops.txt @@ -0,0 +1 @@ +0~*DJORPEL diff --git a/Code/S_NN_train.py b/Code/S_NN_train.py index fa3fc90..da8983a 100644 --- a/Code/S_NN_train.py +++ b/Code/S_NN_train.py @@ -41,7 +41,7 @@ def rmse_loss(pred, targ): denom = torch.sqrt(denom.sum()/len(denom)) return torch.sqrt(F.mse_loss(pred, targ))/denom -def NN_train(pathdir, filename, epochs=-1): +def NN_train(pathdir, filename, epochs=1000, lrs=1e-2, N_red_lr=4, pretrained_path=""): try: os.mkdir("results/NN_trained_models/") except: @@ -55,13 +55,14 @@ def NN_train(pathdir, filename, epochs=-1): n_variables = np.loadtxt(pathdir+"%s" %filename, dtype='str').shape[1]-1 variables = np.loadtxt(pathdir+"%s" %filename, usecols=(0,)) - epochs = epochs//4 + epochs = epochs//N_red_lr + epochs = int(epochs) - if n_variables==0: - print("Solved! ", variables[0]) + if n_variables==0 or n_variables==1: + print("Solved!")#, variables[0]) return 0 - elif n_variables==1: - variables = np.reshape(variables,(len(variables),1)) + #elif n_variables==1: + # variables = np.reshape(variables,(len(variables),1)) else: for j in range(1,n_variables): v = np.loadtxt(pathdir+"%s" %filename, usecols=(j,)) @@ -113,10 +114,14 @@ def NN_train(pathdir, filename, epochs=-1): else: model_feynman = SimpleNet(n_variables) + if pretrained_path!="": + model_feynman.load_state_dict(torch.load(pretrained_path)) + max_loss = 10000 - lrs = 1e-2 - for i_i in range(4): + print("EPOCH", epochs) + + for i_i in range(N_red_lr): optimizer_feynman = optim.Adam(model_feynman.parameters(), lr = lrs) for epoch in range(epochs): model_feynman.train() diff --git a/Code/S_add_bf_on_numbers_on_pareto.py b/Code/S_add_bf_on_numbers_on_pareto.py new file mode 100644 index 0000000..6346ac2 --- /dev/null +++ b/Code/S_add_bf_on_numbers_on_pareto.py @@ -0,0 +1,122 @@ +# Adds on the pareto all the snapped versions of a given expression (all paramters are snapped in the end) + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data as utils +from torch.autograd import Variable +from sklearn.metrics import roc_curve, auc +from sklearn.preprocessing import label_binarize +from sklearn.manifold import TSNE +import copy +import warnings +warnings.filterwarnings("ignore") +import sympy +from S_snap import integerSnap +from S_snap import zeroSnap +from S_snap import rationalSnap +from S_get_symbolic_expr_error import get_symbolic_expr_error +from get_pareto import Point, ParetoSet +from S_brute_force_number import brute_force_number + +from sympy import preorder_traversal, count_ops +from sympy.abc import x,y +from sympy.parsing.sympy_parser import parse_expr +from sympy import Symbol, lambdify, N, simplify, powsimp +from RPN_to_eq import RPN_to_eq + +from S_get_number_DL_snapped import get_number_DL_snapped + +# parameters: path to data, math (not RPN) expression +def add_bf_on_numbers_on_pareto(pathdir, filename, PA, math_expr): + def unsnap_recur(expr, param_dict, unsnapped_param_dict): + """Recursively transform each numerical value into a learnable parameter.""" + import sympy + from sympy import Symbol + if isinstance(expr, sympy.numbers.Float) or isinstance(expr, sympy.numbers.Integer) or isinstance(expr, sympy.numbers.Rational) or isinstance(expr, sympy.numbers.Pi): + used_param_names = list(param_dict.keys()) + list(unsnapped_param_dict) + unsnapped_param_name = get_next_available_key(used_param_names, "p", is_underscore=False) + unsnapped_param_dict[unsnapped_param_name] = float(expr) + unsnapped_expr = Symbol(unsnapped_param_name) + return unsnapped_expr + elif isinstance(expr, sympy.symbol.Symbol): + return expr + else: + unsnapped_sub_expr_list = [] + for sub_expr in expr.args: + unsnapped_sub_expr = unsnap_recur(sub_expr, param_dict, unsnapped_param_dict) + unsnapped_sub_expr_list.append(unsnapped_sub_expr) + return expr.func(*unsnapped_sub_expr_list) + + + def get_next_available_key(iterable, key, midfix="", suffix="", is_underscore=True): + """Get the next available key that does not collide with the keys in the dictionary.""" + if key + suffix not in iterable: + return key + suffix + else: + i = 0 + underscore = "_" if is_underscore else "" + while "{}{}{}{}{}".format(key, underscore, midfix, i, suffix) in iterable: + i += 1 + new_key = "{}{}{}{}{}".format(key, underscore, midfix, i, suffix) + return new_key + + eq = parse_expr(str(math_expr)) + expr = eq + # Get the numbers appearing in the expression + is_atomic_number = lambda expr: expr.is_Atom and expr.is_number + eq_numbers = [subexpression for subexpression in preorder_traversal(expr) if is_atomic_number(subexpression)] + # Do bf on one parameter at a time + bf_on_numbers_expr = [] + for w in range(len(eq_numbers)): + param_dict = {} + unsnapped_param_dict = {'p':1} + eq_ = unsnap_recur(expr,param_dict,unsnapped_param_dict) + eq = eq_ + + np.savetxt(pathdir+"number_for_bf_%s.txt" %w, [eq_numbers[w]]) + brute_force_number(pathdir,"number_for_bf_%s.txt" %w) + # Load the predictions made by the bf code + bf_numbers = np.loadtxt("results.dat",usecols=(1,),dtype="str") + new_numbers = copy.deepcopy(eq_numbers) + + # replace the number under consideration by all the proposed bf numbers + for kk in range(len(bf_numbers)): + eq = eq_ + new_numbers[w] = parse_expr(RPN_to_eq(bf_numbers[kk])) + + jj = 0 + for parm in unsnapped_param_dict: + if parm!="p": + eq = eq.subs(parm, new_numbers[jj]) + jj = jj + 1 + + bf_on_numbers_expr = bf_on_numbers_expr + [eq] + for i in range(len(bf_on_numbers_expr)): + try: + # Calculate the error of the new, snapped expression + snapped_error = get_symbolic_expr_error(pathdir,filename,str(bf_on_numbers_expr[i])) + # Calculate the complexity of the new, snapped expression + expr = simplify(powsimp(bf_on_numbers_expr[i])) + is_atomic_number = lambda expr: expr.is_Atom and expr.is_number + numbers_expr = [subexpression for subexpression in preorder_traversal(expr) if is_atomic_number(subexpression)] + + snapped_complexity = 0 + for j in numbers_expr: + snapped_complexity = snapped_complexity + get_number_DL_snapped(float(j)) + # Add the complexity due to symbols + n_variables = len(expr.free_symbols) + n_operations = len(count_ops(expr,visual=True).free_symbols) + if n_operations!=0 or n_variables!=0: + snapped_complexity = snapped_complexity + (n_variables+n_operations)*np.log2((n_variables+n_operations)) + + PA.add(Point(x=snapped_complexity, y=snapped_error, data=str(expr))) + except: + continue + + return(PA) + diff --git a/Code/S_add_snap_expr_on_pareto.py b/Code/S_add_snap_expr_on_pareto.py index 3ce06d7..fde8182 100644 --- a/Code/S_add_snap_expr_on_pareto.py +++ b/Code/S_add_snap_expr_on_pareto.py @@ -26,10 +26,15 @@ from get_pareto import Point, ParetoSet from sympy import preorder_traversal, count_ops from sympy.abc import x,y from sympy.parsing.sympy_parser import parse_expr -from sympy import Symbol, lambdify, N, simplify, powsimp, Rational, symbols +from sympy import Symbol, lambdify, N, simplify, powsimp, Rational, symbols, S,Float from S_get_number_DL_snapped import get_number_DL_snapped +def intify(expr): + floats = S(expr).atoms(Float) + ints = [i for i in floats if int(i) == i] + return expr.xreplace(dict(zip(ints, [int(i) for i in ints]))) + # parameters: path to data, math (not RPN) expression def add_snap_expr_on_pareto(pathdir, filename, math_expr, PA, DR_file=""): def unsnap_recur(expr, param_dict, unsnapped_param_dict): @@ -66,7 +71,7 @@ def add_snap_expr_on_pareto(pathdir, filename, math_expr, PA, DR_file=""): eq = parse_expr(str(math_expr)) expr = eq - + # Get the numbers appearing in the expression is_atomic_number = lambda expr: expr.is_Atom and expr.is_number eq_numbers = [subexpression for subexpression in preorder_traversal(expr) if is_atomic_number(subexpression)] @@ -134,8 +139,7 @@ def add_snap_expr_on_pareto(pathdir, filename, math_expr, PA, DR_file=""): snapped_expr = np.append(integer_snapped_expr,zero_snapped_expr) snapped_expr = np.append(snapped_expr,rational_snapped_expr) - - + for i in range(len(snapped_expr)): try: # Calculate the error of the new, snapped expression @@ -145,6 +149,8 @@ def add_snap_expr_on_pareto(pathdir, filename, math_expr, PA, DR_file=""): for s in (expr.free_symbols): s = symbols(str(s), real = True) expr = simplify(parse_expr(str(snapped_expr[i]),locals())) + #print("expr 0", expr) + expr = intify(expr) is_atomic_number = lambda expr: expr.is_Atom and expr.is_number numbers_expr = [subexpression for subexpression in preorder_traversal(expr) if is_atomic_number(subexpression)] @@ -167,12 +173,14 @@ def add_snap_expr_on_pareto(pathdir, filename, math_expr, PA, DR_file=""): for i_dr in range(len(old_vars)): expr = expr.replace(old_vars[i_dr],"("+dr_data[i_dr+2]+")") expr = "("+dr_data[1]+")*(" + expr +")" - + expr = parse_expr(expr) for s in (expr.free_symbols): s = symbols(str(s), real = True) expr = simplify(parse_expr(str(expr),locals())) - + #print("expr 1", expr) + #expr = intify(expr) + #print("expr 2", expr) snapped_complexity = 0 for j in numbers_expr: snapped_complexity = snapped_complexity + get_number_DL_snapped(float(j)) diff --git a/Code/S_add_snap_expr_on_pareto_polyfit.py b/Code/S_add_snap_expr_on_pareto_polyfit.py new file mode 100644 index 0000000..cfc96cd --- /dev/null +++ b/Code/S_add_snap_expr_on_pareto_polyfit.py @@ -0,0 +1,129 @@ +# Adds on the pareto all the snapped versions of a given expression (all paramters are snapped in the end) + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data as utils +from torch.autograd import Variable +from sklearn.metrics import roc_curve, auc +from sklearn.preprocessing import label_binarize +from sklearn.manifold import TSNE +import seaborn as sns +import copy +import warnings +warnings.filterwarnings("ignore") +import sympy +from S_snap import integerSnap +from S_snap import zeroSnap +from S_snap import rationalSnap +from S_get_symbolic_expr_error import get_symbolic_expr_error +from get_pareto import Point, ParetoSet + +from sympy import preorder_traversal, count_ops +from sympy.abc import x,y +from sympy.parsing.sympy_parser import parse_expr +from sympy import Symbol, lambdify, N, simplify, powsimp, Rational, symbols, S, Float + +from S_get_number_DL_snapped import get_number_DL_snapped + +def intify(expr): + floats = S(expr).atoms(Float) + ints = [i for i in floats if int(i) == i] + return expr.xreplace(dict(zip(ints, [int(i) for i in ints]))) + +# parameters: path to data, math (not RPN) expression +def add_snap_expr_on_pareto_polyfit(pathdir, filename, math_expr, PA): + def unsnap_recur(expr, param_dict, unsnapped_param_dict): + """Recursively transform each numerical value into a learnable parameter.""" + import sympy + from sympy import Symbol + if isinstance(expr, sympy.numbers.Float) or isinstance(expr, sympy.numbers.Integer) or isinstance(expr, sympy.numbers.Rational) or isinstance(expr, sympy.numbers.Pi): + used_param_names = list(param_dict.keys()) + list(unsnapped_param_dict) + unsnapped_param_name = get_next_available_key(used_param_names, "p", is_underscore=False) + unsnapped_param_dict[unsnapped_param_name] = float(expr) + unsnapped_expr = Symbol(unsnapped_param_name) + return unsnapped_expr + elif isinstance(expr, sympy.symbol.Symbol): + return expr + else: + unsnapped_sub_expr_list = [] + for sub_expr in expr.args: + unsnapped_sub_expr = unsnap_recur(sub_expr, param_dict, unsnapped_param_dict) + unsnapped_sub_expr_list.append(unsnapped_sub_expr) + return expr.func(*unsnapped_sub_expr_list) + + + def get_next_available_key(iterable, key, midfix="", suffix="", is_underscore=True): + """Get the next available key that does not collide with the keys in the dictionary.""" + if key + suffix not in iterable: + return key + suffix + else: + i = 0 + underscore = "_" if is_underscore else "" + while "{}{}{}{}{}".format(key, underscore, midfix, i, suffix) in iterable: + i += 1 + new_key = "{}{}{}{}{}".format(key, underscore, midfix, i, suffix) + return new_key + + eq = parse_expr(str(math_expr)) + expr = eq + + # Get the numbers appearing in the expression + is_atomic_number = lambda expr: expr.is_Atom and expr.is_number + eq_numbers = [subexpression for subexpression in preorder_traversal(expr) if is_atomic_number(subexpression)] + + # Do zero snap one parameter at a time + zero_snapped_expr = [] + for w in range(len(eq_numbers)): + param_dict = {} + unsnapped_param_dict = {'p':1} + eq = unsnap_recur(expr,param_dict,unsnapped_param_dict) + new_numbers = zeroSnap(eq_numbers,w+1) + for kk in range(len(new_numbers)): + eq_numbers[new_numbers[kk][0]] = new_numbers[kk][1] + jj = 0 + for parm in unsnapped_param_dict: + if parm!="p": + eq = eq.subs(parm, eq_numbers[jj]) + jj = jj + 1 + zero_snapped_expr = zero_snapped_expr + [eq] + + for i in range(len(zero_snapped_expr)): + try: + + # Calculate the error of the new, snapped expression + snapped_error = get_symbolic_expr_error(pathdir,filename,str(zero_snapped_expr[i])) + # Calculate the complexity of the new, snapped expression + expr = simplify(powsimp(zero_snapped_expr[i])) + for s in (expr.free_symbols): + s = symbols(str(s), real = True) + expr = simplify(parse_expr(str(zero_snapped_expr[i]),locals())) + expr = intify(expr) + + is_atomic_number = lambda expr: expr.is_Atom and expr.is_number + numbers_expr = [subexpression for subexpression in preorder_traversal(expr) if is_atomic_number(subexpression)] + + snapped_complexity = 0 + for j in numbers_expr: + snapped_complexity = snapped_complexity + get_number_DL_snapped(float(j)) + + # Add the complexity due to symbols + n_variables = len(expr.free_symbols) + n_operations = len(count_ops(expr,visual=True).free_symbols) + if n_operations!=0 or n_variables!=0: + snapped_complexity = snapped_complexity + (n_variables+n_operations)*np.log2((n_variables+n_operations)) + + PA.add(Point(x=snapped_complexity, y=snapped_error, data=str(expr))) + except: + print("error") + print("") + continue + return(PA) + + + + diff --git a/Code/S_brute_force_number.py b/Code/S_brute_force_number.py new file mode 100644 index 0000000..28516e5 --- /dev/null +++ b/Code/S_brute_force_number.py @@ -0,0 +1,27 @@ +# runs BF on data and saves the best RPN expressions in results.dat +# all the .dat files are created after I run this script +# the .scr are needed to run the fortran code + +import numpy as np +import os +import shutil +import subprocess +from subprocess import call +import sys +import csv +import sympy as sp +from sympy.parsing.sympy_parser import parse_expr + +def brute_force_number(pathdir,filename): + try_time = 2 + file_type = "10ops.txt" + + try: + os.remove("results.dat") + except: + pass + + subprocess.call(["./brute_force_oneFile_v1.scr", file_type, "%s" %try_time, pathdir+filename]) + + return 1 + diff --git a/Code/S_final_gd.py b/Code/S_final_gd.py new file mode 100644 index 0000000..6aac63c --- /dev/null +++ b/Code/S_final_gd.py @@ -0,0 +1,147 @@ +# Turns a mathematical expression (already RPN turned) to pytorch expression, trains the parameters, and returns the new error, complexity and the new symbolic expression + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data as utils +from torch.autograd import Variable +from sklearn.metrics import roc_curve, auc +from sklearn.preprocessing import label_binarize +from sklearn.manifold import TSNE +import seaborn as sns +import warnings +warnings.filterwarnings("ignore") +import sympy + +from sympy import * +from sympy.abc import x,y +from sympy.parsing.sympy_parser import parse_expr +from sympy import Symbol, lambdify, N + +from S_get_number_DL import get_number_DL + +# parameters: path to data, RPN expression (obtained from bf) +def final_gd(data_file, math_expr, lr = 1e-2, N_epochs = 5000): + param_dict = {} + unsnapped_param_dict = {'p':1} + + def unsnap_recur(expr, param_dict, unsnapped_param_dict): + """Recursively transform each numerical value into a learnable parameter.""" + import sympy + from sympy import Symbol + if isinstance(expr, sympy.numbers.Float): + used_param_names = list(param_dict.keys()) + list(unsnapped_param_dict) + unsnapped_param_name = get_next_available_key(used_param_names, "p", is_underscore=False) + unsnapped_param_dict[unsnapped_param_name] = float(expr) + unsnapped_expr = Symbol(unsnapped_param_name) + return unsnapped_expr + elif isinstance(expr, sympy.symbol.Symbol): + return expr + else: + unsnapped_sub_expr_list = [] + for sub_expr in expr.args: + unsnapped_sub_expr = unsnap_recur(sub_expr, param_dict, unsnapped_param_dict) + unsnapped_sub_expr_list.append(unsnapped_sub_expr) + return expr.func(*unsnapped_sub_expr_list) + + + def get_next_available_key(iterable, key, midfix="", suffix="", is_underscore=True): + """Get the next available key that does not collide with the keys in the dictionary.""" + if key + suffix not in iterable: + return key + suffix + else: + i = 0 + underscore = "_" if is_underscore else "" + while "{}{}{}{}{}".format(key, underscore, midfix, i, suffix) in iterable: + i += 1 + new_key = "{}{}{}{}{}".format(key, underscore, midfix, i, suffix) + return new_key + + # Load the actual data + + data = np.loadtxt(data_file) + + # Turn BF expression to pytorch expression + eq = parse_expr(math_expr) + # eq = parse_expr("cos(0.5*a)") # this is for test_sympy.txt + # eq = parse_expr("cos(0.5*a)+sin(1.2*b)+6") # this is for test_sympy_2.txt + eq = unsnap_recur(eq,param_dict,unsnapped_param_dict) + + + N_vars = len(data[0])-1 + N_params = len(unsnapped_param_dict) + + possible_vars = ["x%s" %i for i in np.arange(0,30,1)] + variables = [] + params = [] + for i in range(N_vars): + #variables = variables + ["x%s" %i] + variables = variables + [possible_vars[i]] + for i in range(N_params-1): + params = params + ["p%s" %i] + + symbols = params + variables + + f = lambdify(symbols, N(eq), torch) + + # Set the trainable parameters in the expression + + trainable_parameters = [] + for i in unsnapped_param_dict: + if i!="p": + vars()[i] = torch.tensor(unsnapped_param_dict[i]) + vars()[i].requires_grad=True + trainable_parameters = trainable_parameters + [vars()[i]] + + + # Prepare the loaded data + + real_variables = [] + for i in range(len(data[0])-1): + real_variables = real_variables + [torch.from_numpy(data[:,i]).float()] + + input = trainable_parameters + real_variables + + y = torch.from_numpy(data[:,-1]).float() + + + for i in range(N_epochs): + # this order is fixed i.e. first parameters + yy = f(*input) + loss = torch.mean((yy-y)**2) + loss.backward() + with torch.no_grad(): + for j in range(N_params-1): + trainable_parameters[j] -= lr * trainable_parameters[j].grad + trainable_parameters[j].grad.zero_() + + for i in range(N_epochs): + # this order is fixed i.e. first parameters + yy = f(*input) + loss = torch.mean((yy-y)**2) + loss.backward() + with torch.no_grad(): + for j in range(N_params-1): + trainable_parameters[j] -= lr/10 * trainable_parameters[j].grad + trainable_parameters[j].grad.zero_() + + # get the updated symbolic regression + ii = -1 + complexity = 0 + for parm in unsnapped_param_dict: + if ii == -1: + ii = ii + 1 + else: + eq = eq.subs(parm, trainable_parameters[ii]) + complexity = complexity + get_number_DL(trainable_parameters[ii].detach().numpy()) + ii = ii+1 + + + error = torch.mean((f(*input)-y)**2).data.numpy()*1 + return error, complexity, eq + + diff --git a/Code/S_polyfit.py b/Code/S_polyfit.py index b6e7d04..d017283 100644 --- a/Code/S_polyfit.py +++ b/Code/S_polyfit.py @@ -45,11 +45,11 @@ def polyfit(maxdeg, filename): z = z + ["z"+str(ii)] variables = np.matmul(C_1_2,variables.T).T - - parameters = getBest(variables,f_dependent,maxdeg)[0] - params_error = getBest(variables,f_dependent,maxdeg)[1] - deg = getBest(variables,f_dependent,maxdeg)[2] - + res = getBest(variables,f_dependent,maxdeg) + parameters = res[0] + params_error = res[1] + deg = res[2] + x = sympy.Matrix(x) M = sympy.Matrix(C_1_2) b = sympy.Matrix(means) @@ -64,11 +64,15 @@ def polyfit(maxdeg, filename): eq = simplify(eq) else: - parameters = getBest(variables,f_dependent,maxdeg)[0] - params_error = getBest(variables,f_dependent,maxdeg)[1] - deg = getBest(variables,f_dependent,maxdeg)[2] + res = getBest(variables,f_dependent,maxdeg) + parameters = res[0] + params_error = res[1] + deg = res[2] eq = mk_sympy_function(parameters,n_variables,deg) - eq = eq.subs("z0","x0") - + try: + eq = eq.subs("z0","x0") + except: + pass + return (eq, params_error) diff --git a/Code/S_remove_input_neuron.py b/Code/S_remove_input_neuron.py new file mode 100644 index 0000000..241482c --- /dev/null +++ b/Code/S_remove_input_neuron.py @@ -0,0 +1,29 @@ +# Remove on input neuron from a NN + +from __future__ import print_function +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torchvision import datasets, transforms +import pandas as pd +import numpy as np +import torch +from torch.utils import data +import pickle +from matplotlib import pyplot as plt +import torch.utils.data as utils +import time +import os + +def remove_input_neuron(net,n_inp,idx_neuron,ct_median,save_filename): + removed_weights = net.linear1.weight[:,idx_neuron] + # Remove the weights associated with the removed input neuron + t = torch.transpose(net.linear1.weight,0,1) + preserved_ids = torch.LongTensor(np.array(list(set(range(n_inp)) - set([idx_neuron])))) + t = nn.Parameter(t[preserved_ids, :]) + net.linear1.weight = nn.Parameter(torch.transpose(t,0,1)) + # Adjust the biases + net.linear1.bias = nn.Parameter(net.linear1.bias+torch.tensor(ct_median*removed_weights).float().cuda()) + torch.save(net.state_dict(), save_filename) + diff --git a/Code/S_run_aifeynman.py b/Code/S_run_aifeynman.py index 6343186..4ff00e9 100644 --- a/Code/S_run_aifeynman.py +++ b/Code/S_run_aifeynman.py @@ -20,7 +20,8 @@ from S_get_symbolic_expr_error import get_symbolic_expr_error from S_add_snap_expr_on_pareto import add_snap_expr_on_pareto from S_add_sym_on_pareto import add_sym_on_pareto from S_run_bf_polyfit import run_bf_polyfit - +from S_final_gd import final_gd +from dimensionalAnalysis import dimensionalAnalysis PA = ParetoSet() @@ -36,8 +37,7 @@ def run_AI_all(pathdir,filename,BF_try_time=60,BF_ops_file_type="14ops", polyfit # Run bf and polyfit PA = run_bf_polyfit(pathdir,pathdir,filename,BF_try_time,BF_ops_file_type, PA, polyfit_deg) - - + ''' # Run bf and polyfit on modified output PA = get_acos(pathdir,"results/mystery_world_acos/",filename,BF_try_time,BF_ops_file_type, PA, polyfit_deg) PA = get_asin(pathdir,"results/mystery_world_asin/",filename,BF_try_time,BF_ops_file_type, PA, polyfit_deg) @@ -50,13 +50,21 @@ def run_AI_all(pathdir,filename,BF_try_time=60,BF_ops_file_type="14ops", polyfit PA = get_sqrt(pathdir,"results/mystery_world_sqrt/",filename,BF_try_time,BF_ops_file_type, PA, polyfit_deg) PA = get_squared(pathdir,"results/mystery_world_squared/",filename,BF_try_time,BF_ops_file_type, PA, polyfit_deg) PA = get_tan(pathdir,"results/mystery_world_tan/",filename,BF_try_time,BF_ops_file_type, PA, polyfit_deg) + ''' ############################################################################################################################# # check if the NN is trained. If it is not, train it on the data. print("Checking for symmetry \n", filename) - if path.exists("results/NN_trained_models/models/" + filename + ".h5") or len(data[0])<3: + if len(data[0])<3: + print("Just one variable!") + pass + elif path.exists("results/NN_trained_models/models/" + filename + ".h5"):# or len(data[0])<3: print("NN already trained \n") print("NN loss: ", NN_eval(pathdir,filename), "\n") + elif path.exists("results/NN_trained_models/models/" + filename + "_pretrained.h5"): + print("Found pretrained NN \n") + NN_train(pathdir,filename,NN_epochs/2,lrs=1e-3,N_red_lr=3,pretrained_path="results/NN_trained_models/models/" + filename + "_pretrained.h5") + print("NN loss after training: ", NN_eval(pathdir,filename), "\n") else: print("Training a NN on the data... \n") NN_train(pathdir,filename,NN_epochs) @@ -129,27 +137,67 @@ def run_AI_all(pathdir,filename,BF_try_time=60,BF_ops_file_type="14ops", polyfit return PA # this runs snap on the output of aifeynman -def run_aifeynman(pathdir,filename,BF_try_time,BF_ops_file_type, polyfit_deg=4, NN_epochs=4000, DR_file=""): +def run_aifeynman(pathdir,filename,BF_try_time,BF_ops_file_type, polyfit_deg=4, NN_epochs=4000, vars_name=[],test_percentage=20): + # If the variable names are passed, do the dimensional analysis first + filename_orig = filename + try: + if vars_name!=[]: + dimensionalAnalysis(pathdir,filename,vars_name) + DR_file = filename + "_dim_red_variables.txt" + filename = filename + "_dim_red" + else: + DR_file = "" + except: + DR_file = "" + # Split the data into train and test set input_data = np.loadtxt(pathdir+filename) sep_idx = np.random.permutation(len(input_data)) - train_data = input_data[sep_idx[0:8*len(input_data)//10]] - test_data = input_data[sep_idx[8*len(input_data)//10:len(input_data)]] + + train_data = input_data[sep_idx[0:(100-test_percentage)*len(input_data)//100]] + test_data = input_data[sep_idx[test_percentage*len(input_data)//100:len(input_data)]] np.savetxt(pathdir+filename+"_train",train_data) - np.savetxt(pathdir+filename+"_test",test_data) + if test_data.size != 0: + np.savetxt(pathdir+filename+"_test",test_data) # Run the code on the train data PA = run_AI_all(pathdir,filename+"_train",BF_try_time,BF_ops_file_type, polyfit_deg, NN_epochs) PA_list = PA.get_pareto_points() - PA_snapped = ParetoSet() + + # Run bf snap on the resulted equations + for i in range(len(PA_list)): + try: + PA = add_bf_on_numbers_on_pareto(pathdir,filename,PA,PA_list[i][-1]) + except: + continue + PA_list = PA.get_pareto_points() np.savetxt("results/solution_before_snap_%s.txt" %filename,PA_list,fmt="%s") + + # Run zero, integer and rational snap on the resulted equations + PA_snapped_1 = ParetoSet() + for j in range(len(PA_list)): + PA_snapped_1 = add_snap_expr_on_pareto(pathdir,filename,PA_list[j][-1],PA_snapped_1, "") + + PA_list = PA_snapped_1.get_pareto_points() + np.savetxt("results/solution_first_snap_%s.txt" %filename,PA_list,fmt="%s") + + # Run gradient descent on the data one more time + for i in range(len(PA_list)): + try: + gd_update = final_gd(pathdir+filename,PA_list[i][-1]) + PA_snapped_1.add(Point(x=gd_update[1],y=gd_update[0],data=gd_update[2])) + except: + continue + + PA_list = PA_snapped_1.get_pareto_points() + + PA_snapped = ParetoSet() for j in range(len(PA_list)): PA_snapped = add_snap_expr_on_pareto(pathdir,filename,PA_list[j][-1],PA_snapped, DR_file) - - list_dt = np.array(PA_snapped.get_pareto_points()) + list_dt = np.array(PA_snapped.get_pareto_points()) data_file_len = len(np.loadtxt(pathdir+filename)) log_err = [] log_err_all = [] @@ -160,7 +208,7 @@ def run_aifeynman(pathdir,filename,BF_try_time,BF_ops_file_type, polyfit_deg=4, log_err_all = np.array(log_err_all) # Try the found expressions on the test data - if DR_file=="": + if DR_file=="" and test_data.size != 0: test_errors = [] for i in range(len(list_dt)): test_errors = test_errors + [get_symbolic_expr_error(pathdir,filename+"_test",str(list_dt[i][-1]))] @@ -169,5 +217,4 @@ def run_aifeynman(pathdir,filename,BF_try_time,BF_ops_file_type, polyfit_deg=4, save_data = np.column_stack((test_errors,log_err,log_err_all,list_dt)) else: save_data = np.column_stack((log_err,log_err_all,list_dt)) - - np.savetxt("results/solution_%s.txt" %filename,save_data,fmt="%s") + np.savetxt("results/solution_%s.txt" %filename_orig,save_data,fmt="%s") diff --git a/Code/S_run_bf_polyfit.py b/Code/S_run_bf_polyfit.py index eebcf6d..73daf10 100644 --- a/Code/S_run_bf_polyfit.py +++ b/Code/S_run_bf_polyfit.py @@ -13,6 +13,7 @@ from sympy import preorder_traversal, count_ops from S_polyfit import polyfit from S_get_symbolic_expr_error import get_symbolic_expr_error from S_add_sym_on_pareto import add_sym_on_pareto +from S_add_snap_expr_on_pareto_polyfit import add_snap_expr_on_pareto_polyfit import os from os import path @@ -25,153 +26,157 @@ def run_bf_polyfit(pathdir,pathdir_transformed,filename,BF_try_time,BF_ops_file_ print("Checking for brute force + \n") brute_force(pathdir_transformed,filename,BF_try_time,BF_ops_file_type,"+") - # load the BF output data - bf_all_output = np.loadtxt("results.dat", dtype="str") - express = bf_all_output[:,2] - prefactors = bf_all_output[:,1] - prefactors = [str(i) for i in prefactors] - - # Calculate the complexity of the bf expression the same way as for gradient descent case - complexity = [] - errors = [] - eqns = [] - for i in range(len(prefactors)): - try: - if output_type=="": - eqn = prefactors[i] + "+" + RPN_to_eq(express[i]) - elif output_type=="acos": - eqn = "cos(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" - elif output_type=="asin": - eqn = "sin(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" - elif output_type=="atan": - eqn = "tan(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" - elif output_type=="cos": - eqn = "acos(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" - elif output_type=="exp": - eqn = "log(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" - elif output_type=="inverse": - eqn = "1/(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" - elif output_type=="log": - eqn = "exp(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" - elif output_type=="sin": - eqn = "acos(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" - elif output_type=="sqrt": - eqn = "(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")**2" - elif output_type=="squared": - eqn = "sqrt(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" - elif output_type=="tan": - eqn = "atan(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" + try: + # load the BF output data + bf_all_output = np.loadtxt("results.dat", dtype="str") + express = bf_all_output[:,2] + prefactors = bf_all_output[:,1] + prefactors = [str(i) for i in prefactors] + + # Calculate the complexity of the bf expression the same way as for gradient descent case + complexity = [] + errors = [] + eqns = [] + for i in range(len(prefactors)): + try: + if output_type=="": + eqn = prefactors[i] + "+" + RPN_to_eq(express[i]) + elif output_type=="acos": + eqn = "cos(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" + elif output_type=="asin": + eqn = "sin(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" + elif output_type=="atan": + eqn = "tan(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" + elif output_type=="cos": + eqn = "acos(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" + elif output_type=="exp": + eqn = "log(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" + elif output_type=="inverse": + eqn = "1/(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" + elif output_type=="log": + eqn = "exp(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" + elif output_type=="sin": + eqn = "acos(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" + elif output_type=="sqrt": + eqn = "(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")**2" + elif output_type=="squared": + eqn = "sqrt(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" + elif output_type=="tan": + eqn = "atan(" + prefactors[i] + "+" + RPN_to_eq(express[i]) + ")" - eqns = eqns + [eqn] - errors = errors + [get_symbolic_expr_error(pathdir,filename,eqn)] - expr = parse_expr(eqn) - is_atomic_number = lambda expr: expr.is_Atom and expr.is_number - numbers_expr = [subexpression for subexpression in preorder_traversal(expr) if is_atomic_number(subexpression)] - compl = 0 - for j in numbers_expr: - try: - compl = compl + get_number_DL(float(j)) - except: - compl = compl + 1000000 + eqns = eqns + [eqn] + errors = errors + [get_symbolic_expr_error(pathdir,filename,eqn)] + expr = parse_expr(eqn) + is_atomic_number = lambda expr: expr.is_Atom and expr.is_number + numbers_expr = [subexpression for subexpression in preorder_traversal(expr) if is_atomic_number(subexpression)] + compl = 0 + for j in numbers_expr: + try: + compl = compl + get_number_DL(float(j)) + except: + compl = compl + 1000000 - # Add the complexity due to symbols - n_variables = len(expr.free_symbols) - n_operations = len(count_ops(expr,visual=True).free_symbols) - if n_operations!=0 or n_variables!=0: - compl = compl + (n_variables+n_operations)*np.log2((n_variables+n_operations)) + # Add the complexity due to symbols + n_variables = len(expr.free_symbols) + n_operations = len(count_ops(expr,visual=True).free_symbols) + if n_operations!=0 or n_variables!=0: + compl = compl + (n_variables+n_operations)*np.log2((n_variables+n_operations)) - complexity = complexity + [compl] - except: - continue + complexity = complexity + [compl] + except: + continue - for i in range(len(complexity)): - PA.add(Point(x=complexity[i], y=errors[i], data=eqns[i])) - - # run gradient descent of BF output parameters and add the results to the Pareto plot - for i in range(len(express)): - try: - bf_gd_update = RPN_to_pytorch(pathdir+filename,eqns[i]) - PA.add(Point(x=bf_gd_update[1],y=bf_gd_update[0],data=bf_gd_update[2])) - except: - continue + for i in range(len(complexity)): + PA.add(Point(x=complexity[i], y=errors[i], data=eqns[i])) + # run gradient descent of BF output parameters and add the results to the Pareto plot + for i in range(len(express)): + try: + bf_gd_update = RPN_to_pytorch(pathdir+filename,eqns[i]) + PA.add(Point(x=bf_gd_update[1],y=bf_gd_update[0],data=bf_gd_update[2])) + except: + continue + except: + pass ############################################################################################################################# # run BF on the data (*) print("Checking for brute force * \n") brute_force(pathdir_transformed,filename,BF_try_time,BF_ops_file_type,"*") - - # load the BF output data - bf_all_output = np.loadtxt("results.dat", dtype="str") - express = bf_all_output[:,2] - prefactors = bf_all_output[:,1] - prefactors = [str(i) for i in prefactors] + + try: + # load the BF output data + bf_all_output = np.loadtxt("results.dat", dtype="str") + express = bf_all_output[:,2] + prefactors = bf_all_output[:,1] + prefactors = [str(i) for i in prefactors] + + # Calculate the complexity of the bf expression the same way as for gradient descent case + complexity = [] + errors = [] + eqns = [] + for i in range(len(prefactors)): + try: + if output_type=="": + eqn = prefactors[i] + "*" + RPN_to_eq(express[i]) + elif output_type=="acos": + eqn = "cos(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" + elif output_type=="asin": + eqn = "sin(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" + elif output_type=="atan": + eqn = "tan(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" + elif output_type=="cos": + eqn = "acos(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" + elif output_type=="exp": + eqn = "log(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" + elif output_type=="inverse": + eqn = "1/(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" + elif output_type=="log": + eqn = "exp(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" + elif output_type=="sin": + eqn = "acos(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" + elif output_type=="sqrt": + eqn = "(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")**2" + elif output_type=="squared": + eqn = "sqrt(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" + elif output_type=="tan": + eqn = "atan(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" - # Calculate the complexity of the bf expression the same way as for gradient descent case - complexity = [] - errors = [] - eqns = [] - for i in range(len(prefactors)): - try: - if output_type=="": - eqn = prefactors[i] + "*" + RPN_to_eq(express[i]) - elif output_type=="acos": - eqn = "cos(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" - elif output_type=="asin": - eqn = "sin(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" - elif output_type=="atan": - eqn = "tan(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" - elif output_type=="cos": - eqn = "acos(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" - elif output_type=="exp": - eqn = "log(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" - elif output_type=="inverse": - eqn = "1/(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" - elif output_type=="log": - eqn = "exp(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" - elif output_type=="sin": - eqn = "acos(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" - elif output_type=="sqrt": - eqn = "(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")**2" - elif output_type=="squared": - eqn = "sqrt(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" - elif output_type=="tan": - eqn = "atan(" + prefactors[i] + "*" + RPN_to_eq(express[i]) + ")" - - eqns = eqns + [eqn] - errors = errors + [get_symbolic_expr_error(pathdir,filename,eqn)] - expr = parse_expr(eqn) - is_atomic_number = lambda expr: expr.is_Atom and expr.is_number - numbers_expr = [subexpression for subexpression in preorder_traversal(expr) if is_atomic_number(subexpression)] - compl = 0 - for j in numbers_expr: - try: - compl = compl + get_number_DL(float(j)) - except: - compl = compl + 1000000 + eqns = eqns + [eqn] + errors = errors + [get_symbolic_expr_error(pathdir,filename,eqn)] + expr = parse_expr(eqn) + is_atomic_number = lambda expr: expr.is_Atom and expr.is_number + numbers_expr = [subexpression for subexpression in preorder_traversal(expr) if is_atomic_number(subexpression)] + compl = 0 + for j in numbers_expr: + try: + compl = compl + get_number_DL(float(j)) + except: + compl = compl + 1000000 - # Add the complexity due to symbols - n_variables = len(expr.free_symbols) - n_operations = len(count_ops(expr,visual=True).free_symbols) - if n_operations!=0 or n_variables!=0: - compl = compl + (n_variables+n_operations)*np.log2((n_variables+n_operations)) + # Add the complexity due to symbols + n_variables = len(expr.free_symbols) + n_operations = len(count_ops(expr,visual=True).free_symbols) + if n_operations!=0 or n_variables!=0: + compl = compl + (n_variables+n_operations)*np.log2((n_variables+n_operations)) - complexity = complexity + [compl] - except: - continue + complexity = complexity + [compl] + except: + continue - # add the BF output to the Pareto plot - for i in range(len(complexity)): - PA.add(Point(x=complexity[i], y=errors[i], data=eqns[i])) - - # run gradient descent of BF output parameters and add the results to the Pareto plot - for i in range(len(express)): - try: - bf_gd_update = RPN_to_pytorch(pathdir+filename,eqns[i]) - PA.add(Point(x=bf_gd_update[1],y=bf_gd_update[0],data=bf_gd_update[2])) - except: - continue + # add the BF output to the Pareto plot + for i in range(len(complexity)): + PA.add(Point(x=complexity[i], y=errors[i], data=eqns[i])) + # run gradient descent of BF output parameters and add the results to the Pareto plot + for i in range(len(express)): + try: + bf_gd_update = RPN_to_pytorch(pathdir+filename,eqns[i]) + PA.add(Point(x=bf_gd_update[1],y=bf_gd_update[0],data=bf_gd_update[2])) + except: + continue + except: + pass ############################################################################################################################# # run polyfit on the data print("Checking polyfit \n") @@ -210,15 +215,27 @@ def run_bf_polyfit(pathdir,pathdir_transformed,filename,BF_try_time,BF_ops_file_ numbers_expr = [subexpression for subexpression in preorder_traversal(expr) if is_atomic_number(subexpression)] complexity = 0 for j in numbers_expr: - complexity = complexity + get_number_DL(float(j)) - # Add the complexity due to symbols - n_variables = len(polyfit_result[0].free_symbols) - n_operations = len(count_ops(polyfit_result[0],visual=True).free_symbols) - if n_operations!=0 or n_variables!=0: - complexity = complexity + (n_variables+n_operations)*np.log2((n_variables+n_operations)) - - PA.add(Point(x=complexity, y=polyfit_err, data=str(eqn))) + complexity = complexity + get_number_DL(float(j)) + try: + # Add the complexity due to symbols + n_variables = len(polyfit_result[0].free_symbols) + n_operations = len(count_ops(polyfit_result[0],visual=True).free_symbols) + if n_operations!=0 or n_variables!=0: + complexity = complexity + (n_variables+n_operations)*np.log2((n_variables+n_operations)) + except: + pass + + #run zero snap on polyfit output + PA_poly = ParetoSet() + PA_poly.add(Point(x=complexity, y=polyfit_err, data=str(eqn))) + PA_poly = add_snap_expr_on_pareto_polyfit(pathdir, filename, str(eqn), PA_poly) + + + for l in range(len(PA_poly.get_pareto_points())): + PA.add(Point(PA_poly.get_pareto_points()[l][0],PA_poly.get_pareto_points()[l][1],PA_poly.get_pareto_points()[l][2])) + + print("Complexity RMSE Expression") for pareto_i in range(len(PA.get_pareto_points())): print(PA.get_pareto_points()[pareto_i]) diff --git a/Code/S_symmetry.py b/Code/S_symmetry.py index b212fc6..a85598c 100644 --- a/Code/S_symmetry.py +++ b/Code/S_symmetry.py @@ -14,6 +14,7 @@ from torch.utils import data import pickle from torch.optim.lr_scheduler import CosineAnnealingLR from matplotlib import pyplot as plt +from S_remove_input_neuron import remove_input_neuron import time is_cuda = torch.cuda.is_available() @@ -156,6 +157,7 @@ def do_translational_symmetry_minus(pathdir, filename, i,j): with torch.no_grad(): file_name = filename + "-translated_minus" + ct_median = torch.median(torch.from_numpy(variables[:,j])) data_translated = variables data_translated[:,i] = variables[:,i]-variables[:,j] data_translated = np.delete(data_translated, j, axis=1) @@ -165,8 +167,9 @@ def do_translational_symmetry_minus(pathdir, filename, i,j): except: pass np.savetxt("results/translated_data_minus/"+file_name , data_translated) + remove_input_neuron(model,n_variables,j,ct_median,"results/NN_trained_models/models/"+filename + "-translated_minus_pretrained.h5") return ("results/translated_data_minus/",file_name) - + except Exception as e: print(e) return (-1,-1) @@ -286,6 +289,7 @@ def do_translational_symmetry_divide(pathdir, filename, i,j): with torch.no_grad(): file_name = filename + "-translated_divide" data_translated = variables + ct_median =torch.median(torch.from_numpy(variables[:,j])) data_translated[:,i] = variables[:,i]/variables[:,j] data_translated = np.delete(data_translated, j, axis=1) data_translated = np.column_stack((data_translated,f_dependent)) @@ -294,8 +298,9 @@ def do_translational_symmetry_divide(pathdir, filename, i,j): except: pass np.savetxt("results/translated_data_divide/"+file_name , data_translated) + remove_input_neuron(model,n_variables,j,ct_median,"results/NN_trained_models/models/"+filename + "-translated_divide_pretrained.h5") return ("results/translated_data_divide/",file_name) - + except Exception as e: print(e) return (-1,1) @@ -413,6 +418,7 @@ def do_translational_symmetry_multiply(pathdir, filename, i,j): with torch.no_grad(): file_name = filename + "-translated_multiply" data_translated = variables + ct_median =torch.median(torch.from_numpy(variables[:,j])) data_translated[:,i] = variables[:,i]*variables[:,j] data_translated = np.delete(data_translated, j, axis=1) data_translated = np.column_stack((data_translated,f_dependent)) @@ -421,8 +427,9 @@ def do_translational_symmetry_multiply(pathdir, filename, i,j): except: pass np.savetxt("results/translated_data_multiply/"+file_name , data_translated) + remove_input_neuron(model,n_variables,j,ct_median,"results/NN_trained_models/models/"+filename + "-translated_multiply_pretrained.h5") return ("results/translated_data_multiply/",file_name) - + except Exception as e: print(e) return (-1,1) @@ -539,6 +546,7 @@ def do_translational_symmetry_plus(pathdir, filename, i,j): with torch.no_grad(): file_name = filename + "-translated_plus" data_translated = variables + ct_median =torch.median(torch.from_numpy(variables[:,j])) data_translated[:,i] = variables[:,i]+variables[:,j] data_translated = np.delete(data_translated, j, axis=1) data_translated = np.column_stack((data_translated,f_dependent)) @@ -547,6 +555,7 @@ def do_translational_symmetry_plus(pathdir, filename, i,j): except: pass np.savetxt("results/translated_data_plus/"+file_name , data_translated) + remove_input_neuron(model,n_variables,j,ct_median,"results/NN_trained_models/models/"+filename + "-translated_plus_pretrained.h5") return ("results/translated_data_plus/", file_name) except Exception as e: diff --git a/Code/ai_feynman_example.py b/Code/ai_feynman_example.py index e7e23ad..940eb60 100644 --- a/Code/ai_feynman_example.py +++ b/Code/ai_feynman_example.py @@ -1,3 +1,4 @@ from S_run_aifeynman import run_aifeynman run_aifeynman("../example_data/","example3.txt",30,"14ops.txt", polyfit_deg=4, NN_epochs=400) + diff --git a/Code/ai_feynman_terminal_example.py b/Code/ai_feynman_terminal_example.py index 8fdd880..ffdaecd 100644 --- a/Code/ai_feynman_terminal_example.py +++ b/Code/ai_feynman_terminal_example.py @@ -9,6 +9,8 @@ parser.add_argument("--BF_try_time", type=float, default=60, help="Time limit fo parser.add_argument("--BF_ops_file_type", type=str, default="14ops.txt", help="File containing the symbols to be used in the brute force code") parser.add_argument("--polyfit_deg", type=int, default=4, help="Maximum degree of the polynomial tried by the polynomial fit routine") parser.add_argument("--NN_epochs", type=int, default=2000, help="Number of epochs for the training") +parser.add_argument("--vars_name", type=list, default=[], help="List with the names of the variables") +parser.add_argument("--test_percentage", type=float, default=0, help="Percentage of the input data to be kept as the test set") opts = parser.parse_args() diff --git a/Code/brute_force_oneFile_v1.scr b/Code/brute_force_oneFile_v1.scr new file mode 100644 index 0000000..0b74953 --- /dev/null +++ b/Code/brute_force_oneFile_v1.scr @@ -0,0 +1,19 @@ +#!/bin/csh +# USAGE EXAMPLE: solve_mysteries.scr ops6.txt 2 +# USAGE EXAMPLE: solve_mysteries.scr allops.txt 1800 +set opsfile = $1 +set maxtime = $2 +set f = $3 + +set outfile = brute_solutions.dat +set outfile2 = brute_formulas.dat +if -f $outfile /bin/rm $outfile +if -f $outfile2 /bin/rm $outfile2 + +echo Trying to solve mysteries with brute force... + +echo Trying to solve $f... +echo /bin/cp -p $f mystery.dat +/bin/cp -p $f mystery.dat +echo $opsfile arity2templates.txt mystery.dat results.dat >args.dat +timeout {$maxtime}s ./symbolic_regress1.x diff --git a/Code/compile.sh b/Code/compile.sh index e6c23f9..3a0e966 100644 --- a/Code/compile.sh +++ b/Code/compile.sh @@ -1,3 +1,3 @@ -gfortran -ffixed-line-length-none -O3 -o symbolic_regress.x symbolic_regress.f +gfortran -ffixed-line-length-none -O3 -o symbolic_regress1.x symbolic_regress1.f gfortran -ffixed-line-length-none -O3 -o symbolic_regress2.x symbolic_regress2.f gfortran -ffixed-line-length-none -O3 -o symbolic_regress3.x symbolic_regress3.f diff --git a/Code/dimensionalAnalysis.py b/Code/dimensionalAnalysis.py new file mode 100644 index 0000000..b3f2036 --- /dev/null +++ b/Code/dimensionalAnalysis.py @@ -0,0 +1,129 @@ +import numpy as np +import pandas as pd +from scipy.sparse.linalg import lsqr +from scipy.linalg import * +from sympy import Matrix +from sympy import symbols, Add, Mul, S +from getPowers import getPowers + +def dimensional_analysis(input,output,units): + M = units[input[0]] + for i in range(1,len(input)): + M = np.c_[M, units[input[i]]] + if len(input)==1: + M = np.array(M) + M = np.reshape(M,(len(M),1)) + params = getPowers(M,units[output]) + M = Matrix(M) + B = M.nullspace() + return (params, B) + +# load the data from a file +def load_data(pathdir, filename): + n_variables = np.loadtxt(pathdir+filename, dtype='str').shape[1]-1 + variables = np.loadtxt(pathdir+filename, usecols=(0,)) + for i in range(1,n_variables): + v = np.loadtxt(pathdir+filename, usecols=(i,)) + variables = np.column_stack((variables,v)) + f_dependent = np.loadtxt(pathdir+filename, usecols=(n_variables,)) + return(variables.T,f_dependent) + +def dimensionalAnalysis(pathdir, filename, eq_symbols): + file = pd.read_excel("units.xlsx") + + units = {} + for i in range(len(file["Variable"])): + val = [file["m"][i],file["s"][i],file["kg"][i],file["T"][i],file["V"][i],file["cd"][i]] + val = np.array(val) + units[file["Variable"][i]] = val + + dependent_var = eq_symbols[-1] + + file_sym = open(filename + "_dim_red_variables.txt" ,"w") + file_sym.write(filename) + file_sym.write(", ") + + # load the data corresponding to the first line (from mystery_world) + varibs = load_data(pathdir,filename)[0] + deps = load_data(pathdir,filename)[1] + + # get the data in symbolic form and associate the corresponding values to it + input = [] + for i in range(len(eq_symbols)-1): + input = input + [eq_symbols[i]] + vars()[eq_symbols[i]] = varibs[i] + output = dependent_var + + # Check if all the independent variables are dimensionless + ok = 0 + for j in range(len(input)): + if(units[input[j]].any()): + ok=1 + + if ok==0: + dimless_data = load_data(pathdir, filename)[0].T + dimless_dep = load_data(pathdir, filename)[1] + if dimless_data.ndim==1: + dimless_data = np.reshape(dimless_data,(1,len(dimless_data))) + dimless_data = dimless_data.T + np.savetxt(pathdir + filename + "_dim_red", dimless_data) + file_sym.write(", ") + for j in range(len(input)): + file_sym.write(str(input[j])) + file_sym.write(", ") + file_sym.write("\n") + else: + # get the symbolic form of the solved part + solved_powers = dimensional_analysis(input,output,units)[0] + input_sym = symbols(input) + sol = symbols("sol") + sol = 1 + for i in range(len(input_sym)): + sol = sol*input_sym[i]**np.round(solved_powers[i],2) + file_sym.write(str(sol)) + file_sym.write(", ") + + # get the symbolic form of the unsolved part + unsolved_powers = dimensional_analysis(input,output,units)[1] + + #print(unsolved_powers,unsolved_powers[0]) + uns = symbols("uns") + unsolved = [] + for i in range(len(unsolved_powers)): + uns = 1 + for j in range(len(unsolved_powers[i])): + uns = uns*input_sym[j]**unsolved_powers[i][j] + file_sym.write(str(uns)) + file_sym.write(", ") + unsolved = unsolved + [uns] + file_sym.write("\n") + + # get the discovered part of the function + func = 1 + for j in range(len(input)): + func = func * vars()[input[j]]**dimensional_analysis(input,output,units)[0][j] + func = np.array(func) + + # get the new variables needed + new_vars = [] + for i in range(len(dimensional_analysis(input,output,units)[1])): + nv = 1 + for j in range(len(input)): + nv = nv*vars()[input[j]]**dimensional_analysis(input,output,units)[1][i][j] + new_vars = new_vars + [nv] + + new_vars = np.array(new_vars) + new_dependent = deps/func + + if new_vars.size==0: + np.savetxt(pathdir + filename + "_dim_red", new_dependent) + + # save this to file + all_variables = np.vstack((new_vars, new_dependent)).T + np.savetxt(pathdir + filename + "_dim_red", all_variables) + + file_sym.close() + + +#print(dimensionalAnalysis("../_noise_data/", "119_1.24.6", ["m","omega","omega_0","x","E_n"])) + diff --git a/Code/getPowers.py b/Code/getPowers.py new file mode 100644 index 0000000..0efaacf --- /dev/null +++ b/Code/getPowers.py @@ -0,0 +1,62 @@ +import numpy as np +import pandas as pd +from scipy.sparse.linalg import lsqr +from scipy.linalg import * +from sympy import Matrix +from sympy import symbols, Add, Mul, S +from numpy.linalg import matrix_rank +from itertools import combinations + + +N = np.array([[ 0, 1, 1], + [ 0, -1, -1], + [ 1, 0, 0], + [ 0, 0, 0], + [ 0, 0, 0], + [ 0, 0, 0],]) + +N = np.array([[ 0, 0, 3, 1, 1, 1, 1, 1, 1], + [ 0, 0, -2, 0, 0, 0, 0, 0, 0], + [ 1, 1, -1, 0, 0, 0, 0, 0, 0], + [ 0, 0, 0, 0, 0, 0, 0, 0, 0], + [ 0, 0, 0, 0, 0, 0, 0, 0, 0], + [ 0, 0, 0, 0, 0, 0, 0, 0, 0]]) + +a = np.array([ 1, -2, 1, 0, 0, 0]) + +def getPowers(N,a): + rand_drop_cols = np.arange(0,len(N[0]),1) + rand_drop_rows = np.arange(0,len(N),1) + rand_drop_rows = np.flip(rand_drop_rows) + rank = matrix_rank(N) + d_cols = list(combinations(rand_drop_cols,len(N[0])-rank)) + d_rows = list(combinations(rand_drop_rows,len(N)-rank)) + for i in d_cols: + M = N + M = np.delete(M,i,1) + M = np.transpose(M) + for j in d_rows: + P = M + P = np.delete(P,j,1) + if np.linalg.det(P)!=0: + solved_M = np.transpose(P) + indices_sol = j + indices_powers = i + break + + b = np.delete(a,indices_sol) + params = np.linalg.solve(solved_M,b) + + sol = [] + for i in range(len(N[0])): + if i in indices_powers: + sol = sol + [0] + else: + sol = sol + [params[0]] + params = np.delete(params,0) + + # this is the solution: + sol = np.array(sol) + return(sol) + + diff --git a/Code/symbolic_regress1.f b/Code/symbolic_regress1.f new file mode 100644 index 0000000..8dad3af --- /dev/null +++ b/Code/symbolic_regress1.f @@ -0,0 +1,270 @@ + ! Max Tegmark 171119, 190128-31, 190506 + ! Loads templates.csv functions.dat and mystery.dat, returns winner. + ! scp -P2222 symbolic_regress1.f euler@tor.mit.edu:FEYNMAN + ! COMPILATION: a f 'f77 -O3 -o symbolic_regress1.x symbolic_regress1.f |& more' + ! SAMPLE USAGE: call symbolic_regress1.x 10ops.txt arity2templates.txt mystery_constant.dat results.dat + ! functions.dat contains a single line (say "0>+*-/") with the single-character symbols + ! that will be used, drawn from this list: + ! + ! Binary: + ! +: add + ! *: multiply + ! -: subtract + ! /: divide (Put "D" instead of "/" in file, since f77 can't load backslash + ! + ! Unary: + ! >: increment (x -> x+1) + ! <: decrement (x -> x-1) + ! ~: negate (x-> -x) + ! \: invert (x->1/x) (Put "I" instead of "\" in file, since f77 can't load backslash + ! L: logaritm (x-> ln(x) + ! E: exponentiate (x->exp(x)) + ! S: sin: (x->sin(x)) + ! C: cos: (x->cos(x)) + ! A: abs: (x->abs(x)) + ! N: arcsin (x->arcsin(x)) + ! T: arctan (x->arctan(x)) + ! R: sqrt (x->sqrt(x)) + ! O: double (x->2*x); note that this is the letter "O", not zero + ! J: double+1 (x->2*x+1) + ! nonary: + ! 0 + ! 1 + ! P = pi + ! a, b, c, ...: input variables for function (need not be listed in functions.dat) + + program symbolic_regress + call go + end + + subroutine go + implicit none + character*60 opsfile, templatefile, mysteryfile, outfile, usedfuncs + character*60 comline, functions, ops, formula + integer arities(21), nvar, nvarmax, nmax, lnblnk + parameter(nvarmax=20, nmax=10000000) + real*8 f, newloss, minloss, maxloss, rmsloss, xy(nvarmax+1,nmax), epsilon, DL, DL2, DL3 + parameter(epsilon=0.00000001) + data arities /2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0/ + data functions /"+*-/><~\OJLESCANTR01P"/ + integer nn(0:2), ii(nmax), kk(nmax), radix(nmax) + integer ndata, i, j, n + integer*8 nformulas + logical done + character*60 func(0:2), template + + open(2,file='args.dat',status='old',err=666) + read(2,*) opsfile, templatefile, mysteryfile, outfile + close(2) + + nvar = 0 + write(*,'(1a24,i8)') 'Number of variables.....',nvar + + open(2,file=opsfile,status='old',err=668) + read(2,*) usedfuncs + close(2) + nn(0)=0 + nn(1)=0 + nn(2)=0 + do i=1,lnblnk(usedfuncs) + if (usedfuncs(i:i).eq.'D') usedfuncs(i:i)='/' + if (usedfuncs(i:i).eq.'I') usedfuncs(i:i)='\' + j = index(functions,usedfuncs(i:i)) + if (j.eq.0) then + print *,'DEATH ERROR: Unknown function requested: ',usedfuncs(i:i) + stop + else + nn(arities(j)) = nn(arities(j)) + 1 + func(arities(j))(nn(arities(j)):nn(arities(j))) = functions(j:j) + end if + end do + ! Add nonary ops to retrieve each of the input variables: + do i=1,nvar + nn(0) = nn(0) + 1 + func(0)(nn(0):nn(0)) = char(96+i) + end do + write(*,'(1a24,1a22)') 'Functions used..........',usedfuncs(1:lnblnk(usedfuncs)) + do i=0,2 + write(*,*) 'Arity ',i,': ',func(i)(1:nn(i)) + end do + + write(*,'(1a24)') 'Loading mystery data....' + call LoadMatrixTranspose(nvarmax+1,nvar+1,nmax,ndata,xy,mysteryfile) + write(*,'(1a24,i8)') 'Number of examples......',ndata + + print *,'Searching for best fit...' + nformulas = 0 + minloss = 1.e6 + template = '' + ops='====================' + open(2,file=templatefile,status='old',err=670) + open(3,file=outfile) +555 read(2,'(1a60)',end=665) template + n = lnblnk(template) + !print *,"template:",template(1:n),"#####" + do i=1,n + ii(i) = ichar(template(i:i))-48 + radix(i) = nn(ii(i)) + kk(i) = 0 + !print *,'ASILOMAR ', i,ii(i),kk(i),radix(i) + end do + done = .false. + do while ((minloss.gt.epsilon).and.(.not.done)) + nformulas = nformulas + 1 + ! Analyze structure ii: + do i=1,n + ops(i:i) = func(ii(i))(1+kk(i):1+kk(i)) + !print *,'TEST ',i,ii(i), func(ii(i)) + end do + !write(*,'(1f20.12,99i3)') minloss, (ii(i),i=1,n), (kk(i),i=1,n) + !write(*,'(1a24)') ops(1:n) + j = 1 + maxloss = 0. + do while ((maxloss.lt.minloss).and.(j.le.ndata)) + newloss = abs(xy(nvar+1,j) - f(n,ii,ops,xy(1,j))) + !!!!!print *,'newloss: ',j,newloss,xy(nvar,j),f(n,ii,ops,xy(1,j)) + if (.not.((newloss.ge.0).or.(newloss.le.0))) newloss = 1.e30 ! This was a NaN :-) + if (maxloss.lt.newloss) maxloss = newloss + j = j + 1 + end do + if (maxloss.lt.minloss) then ! We have a new best fit + minloss = maxloss + rmsloss = 0. + do j=1,ndata + rmsloss = rmsloss + (xy(nvar+1,j) - f(n,ii,ops,xy(1,j)))**2 + end do + rmsloss = sqrt(rmsloss/ndata) + DL = log(nformulas*max(1.,rmsloss/epsilon))/log(2.) + DL2 = log(nformulas*max(1.,rmsloss/1.e-15))/log(2.) + DL3 = (log(1.*nformulas) + sqrt(1.*ndata)*log(max(1.,rmsloss/1.e-15)))/log(2.) + write(*,'(1f20.12,x,1a22,1i16,4f19.4)') minloss, ops(1:n), nformulas, rmsloss, DL, DL2, DL3 + write(3,'(1f20.12,x,1a22,1i16,4f19.4)') minloss, ops(1:n), nformulas, rmsloss, DL, DL2, DL3 + flush(3) + end if + call multiloop(n,radix,kk,done) + end do + goto 555 +665 close(3) + close(2) + print *,'All done: results in ',outfile + return +666 stop 'DEATH ERROR: missing file args.dat' +668 print *,'DEATH ERROR: missing file ',opsfile(1:lnblnk(opsfile)) + stop +670 print *,'DEATH ERROR: missing file ',templatefile(1:lnblnk(templatefile)) + stop + end + + + + real*8 function f(n,arities,ops,x) ! n=number of ops, x=arg vector + implicit none + integer nmax, n, i, j, arities(n), arity, lnblnk + character*60 ops + parameter(nmax=100) + real*8 x(nmax), y, stack(nmax) + character op + !write(*,*) 'Evaluating function with ops = ',ops(1:n) + !write(*,'(3f10.5,99i3)') (x(i),i=1,3), (arities(i),i=1,n) + j = 0 ! Number of numbers on the stack + do i=1,n + arity = arities(i) + op = ops(i:i) + if (arity.eq.0) then ! This is a nonary function + if (op.eq."0") then + y = 0. + else if (op.eq."1") then + y = 1. + else if (op.eq."P") then + y = 4.*atan(1.) ! pi + else + y = x(ichar(op)-96) + end if + else if (arity.eq.1) then ! This is a unary function + if (op.eq.">") then + y = stack(j) + 1 + else if (op.eq."<") then + y = stack(j) - 1 + else if (op.eq."~") then + y = -stack(j) + else if (op.eq."\") then + y = 1./stack(j) + else if (op.eq."L") then + y = log(stack(j)) + else if (op.eq."E") then + y = exp(stack(j)) + else if (op.eq."S") then + y = sin(stack(j)) + else if (op.eq."C") then + y =cos(stack(j)) + else if (op.eq."A") then + y = abs(stack(j)) + else if (op.eq."N") then + y = asin(stack(j)) + else if (op.eq."T") then + y = atan(stack(j)) + else if (op.eq."O") then + y = 2.*stack(j) + else if (op.eq."J") then + y = 1+2.*stack(j) + else + y = sqrt(stack(j)) + end if + else ! This is a binary function + if (op.eq."+") then + y = stack(j-1)+stack(j) + else if (op.eq."-") then + y = stack(j-1)-stack(j) + else if (op.eq."*") then + y = stack(j-1)*stack(j) + else + y = stack(j-1)/stack(j) + end if + end if + j = j + 1 - arity + stack(j) = y + ! write(*,'(9f10.5)') (stack(k),k=1,j) + end do + if (j.ne.1) stop 'DEATH ERROR: STACK UNBALANCED' + f = stack(1) + !write(*,'(9f10.5)') 666.,x(1),x(2),x(3),f + return + end + + subroutine multiloop(n,bases,i,done) + ! Handles nested loops with loop variables i(1),...i(n). + ! Example: With n=3, bases=2, repeated calls starting with i=(000) will return + ! 001, 010, 011, 100, 101, 110, 111, 000 (and done=.true. the last time). + ! All it's doing is counting in mixed radix specified by the array . + implicit none + integer n, bases(n), i(n), k + logical done + done = .false. + k = 1 +555 i(k) = i(k) + 1 + if (i(k).lt.bases(k)) return + i(k) = 0 + k = k + 1 + if (k.le.n) goto 555 + done = .true. + return + end + + subroutine LoadMatrixTranspose(nd,n,mmax,m,A,f) + ! Reads the n x m matrix A from the file named f, stored as its transpose + implicit none + integer nd,mmax,n,m,j + real*8 A(nd,mmax) + character*60 f + open(2,file=f,status='old') + m = 0 +555 m = m + 1 + if (m.gt.mmax) stop 'DEATH ERROR: m>mmax in LoadVectorTranspose' + read(2,*,end=666) (A(j,m),j=1,n) + goto 555 +666 close(2) + m = m - 1 + print *,m,' rows read from file ',f + return + end +