GPs on Non-Euclidean Input Spaces#
GPs on non-Euclidean input spaces have become increasingly relevant in recent years, especially for Bayesian optimization in chemistry. gpCAM can be used for that purpose as long as a valid kernel (a symmetric, positive semi-definite covariance function) is defined on the input space. Of course, if mean and noise functions are also provided, they have to operate on these non-Euclidean spaces as well.
In this example, we run a small GP on words.
# Install the newest version of gpcam
#!pip install gpcam==8.3.5
import numpy as np
import matplotlib.pyplot as plt
from gpcam import GPOptimizer
from gpcam.kernels import *
%load_ext autoreload
%autoreload 2
# x_data is a plain Python list, so its entries can be arbitrary objects or
# structures (here: words). y_data holds one scalar observation per entry.
x_data = ["hello", "world", "this", "is", "gpcam"]
y_data = np.array([2.0, 1.9, 1.8, 3.0, 5.0])
def string_distance(string1, string2):
    """Return a simple distance between two strings.

    The distance is the difference in length plus the number of positions,
    within the common length of both strings, at which the characters differ.

    Parameters
    ----------
    string1, string2 : str
        The strings to compare.

    Returns
    -------
    float
        The distance; 0.0 if and only if both strings are equal.
    """
    length_gap = abs(len(string1) - len(string2))
    # zip stops at the shorter string, so only the common length is compared.
    mismatches = sum(a != b for a, b in zip(string1, string2))
    # Always return a float so the result type does not depend on the inputs.
    return float(length_gap + mismatches)
def kernel(x1, x2, hps):
    """Kernel between two collections of strings.

    Fills a len(x1)-by-len(x2) matrix with the pairwise `string_distance`
    values and passes it through `matern_kernel_diff1`.

    Parameters
    ----------
    x1, x2 : sized iterables of str
        The two collections of input strings.
    hps : indexable
        hps[0] multiplies the kernel output; hps[1] is handed to
        `matern_kernel_diff1` as its second argument (presumably the
        length scale -- confirm against the gpcam.kernels documentation).

    Returns
    -------
    The value of hps[0] * matern_kernel_diff1(distances, hps[1]).
    """
    distances = np.zeros((len(x1), len(x2)))
    for row, left in enumerate(x1):
        for col, right in enumerate(x2):
            distances[row, col] = string_distance(left, right)
    return hps[0] * matern_kernel_diff1(distances, hps[1])
# One [lower, upper] row per hyperparameter of `kernel` (hps[0] and hps[1]).
bounds = np.array([[0.001, 100.0], [0.001, 100.0]])

# Both hyperparameters start at 1; the custom string kernel replaces the default.
my_gp = GPOptimizer(
    x_data,
    y_data,
    init_hyperparameters=np.ones(2),
    kernel_function=kernel,
)
my_gp.train(hyperparameter_bounds=bounds)

# Report the trained hyperparameters, then query the GP at the word 'full'.
print("hyperparameters: ", my_gp.hyperparameters)
print("prediction : ", my_gp.posterior_mean(["full"])["m(x)"])
print("uncertainty: ", np.sqrt(my_gp.posterior_covariance(["full"])["v(x)"]))
/home/marcus/VirtualEnvironments/gpcam_dev/lib/python3.11/site-packages/fvgp/gp.py:310: UserWarning: No noise function or measurement noise provided. Noise variances will be set to (0.01 * mean(|y_data|))^2.
self.likelihood = GPlikelihood(self.data,
hyperparameters: [49.31361296 26.90240104]
prediction : 2.0046357760678624
uncertainty: [0.97016186]
# Which of these candidate words should be measured next? Request n = 4 suggestions.
my_gp.ask(["who", "could", "it", "be"], n=4)
{'x': array([['could'],
['who'],
['be'],
['it']], dtype='<U5'),
'f_a(x)': array([0.82710967, 0.8218738 , 0.65115158, 0.43813236]),
'opt_obj': None}
Non-Euclidean Input Spaces with Multiple Tasks#
import numpy as np
import matplotlib.pyplot as plt
from gpcam import fvGPOptimizer
import plotly.graph_objects as go
from itertools import product
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
x_data = ["frf", "ferfe", "ferf", "febhn"]

# One row per input string and five output columns, each column filled with
# its own draw of random values in [0, 1).
y_data = np.zeros((len(x_data), 5))
for task in range(y_data.shape[1]):
    y_data[:, task] = np.random.rand(len(x_data))

# it is good practice to check the format of the data
print(len(x_data))
print(y_data.shape)
4
(4, 5)
def string_distance(string1, string2):
    """Return a simple distance between two strings.

    The result is the absolute difference of the two lengths plus the count
    of positions, over the shorter string's length, where the characters
    are not the same.

    Parameters
    ----------
    string1, string2 : str
        The strings to compare.

    Returns
    -------
    float
        The distance; 0.0 if and only if both strings are equal.
    """
    # Characters beyond the shorter string each count as one difference.
    total = abs(len(string1) - len(string2))
    # zip pairs characters only up to the shorter string's length.
    for char1, char2 in zip(string1, string2):
        if char1 != char2:
            total += 1
    # Always hand back a float, regardless of whether any character differed.
    return float(total)
from gpcam.kernels import matern_kernel_diff1
def kernel(x1, x2, hps):
    """Kernel between two collections of inputs whose first item is a string.

    Only element [0] of every input is used: the pairwise `string_distance`
    of those first items fills a len(x1)-by-len(x2) matrix, which is then
    passed through `matern_kernel_diff1`.

    Parameters
    ----------
    x1, x2 : sized iterables of indexable entries
        Each entry's first item is the string to compare. NOTE(review): the
        remaining items are presumably the task index appended by the
        multi-task optimizer and are ignored here -- confirm.
    hps : indexable
        hps[0] multiplies the kernel output; hps[1] is handed to
        `matern_kernel_diff1` as its second argument (presumably the
        length scale -- confirm against the gpcam.kernels documentation).

    Returns
    -------
    The value of hps[0] * matern_kernel_diff1(distances, hps[1]).
    """
    distances = np.zeros((len(x1), len(x2)))
    for row, left_entry in enumerate(x1):
        left = left_entry[0]
        for col, right_entry in enumerate(x2):
            distances[row, col] = string_distance(left, right_entry[0])
    return hps[0] * matern_kernel_diff1(distances, hps[1])
# One [lower, upper] row per hyperparameter of `kernel` (hps[0] and hps[1]).
bounds = np.array([[0.001, 100.0], [0.001, 100.0]])
my_gp2 = fvGPOptimizer(
    x_data,
    y_data,
    init_hyperparameters=np.ones(2),
    kernel_function=kernel,
)

print("Global Training in progress")
# A custom kernel is passed above, so the bounds of its two hyperparameters
# are supplied explicitly. An alternative setting with wider bounds and fewer
# iterations is kept here for reference:
#hps_bounds = np.array([[0.001,10000.],[1.,1000.]])
#my_gp2.train(hyperparameter_bounds = hps_bounds, max_iter = 2)
# NOTE(review): the original notes state that, when no kernel is passed, a
# default (deep) kernel is used that sets its own initial hyperparameters and
# bounds -- confirm against the gpCAM documentation.
my_gp2.train(hyperparameter_bounds=bounds, max_iter=20)
Global Training in progress
array([ 2.54827723, 15.05228436])
# Posterior mean at two new words, requested for output indices 0 to 3.
x_pred = ["dwed", "dwe"]
my_gp2.posterior_mean(x_pred, x_out=np.array([0, 1, 2, 3]))
{'x': ['dwed', 'dwe'],
'm(x)': array([[0.25466475, 0.25466475, 0.25466475, 0.25466475],
[0.29003775, 0.29003775, 0.29003775, 0.29003775]]),
'm(x)_flat': array([0.25466475, 0.29003775, 0.25466475, 0.29003775, 0.25466475,
0.29003775, 0.25466475, 0.29003775]),
'x_pred': [['dwed', np.int64(0)],
['dwe', np.int64(0)],
['dwed', np.int64(1)],
['dwe', np.int64(1)],
['dwed', np.int64(2)],
['dwe', np.int64(2)],
['dwed', np.int64(3)],
['dwe', np.int64(3)]]}
# Ask for n = 4 suggestions among the candidate words, for output indices 0 to 3.
my_gp2.ask(
    ["who", "could", "it", "be"],
    n=4,
    x_out=np.array([0, 1, 2, 3]),
    vectorized=True,
)
{'x': array([['could'],
['who'],
['it'],
['be']], dtype='<U5'),
'f_a(x)': array([1.46161733, 0.72355182, 0.72355182, 0.28535747]),
'opt_obj': None}