This is the fourth part of the ML series. For your convenience, you can find links to the other parts in the table of contents in Part 1 – Linear regression in MXNet.
This time we are going to implement linear regression as a function. This gives us more flexibility when debugging and reading the code later, and it also lets us implement much more complex algorithms. Unfortunately, we can't use this approach in Redshift at this time, as it doesn't support such functions or stored procedures, so I will use T-SQL and test the code with MS SQL 2017. I assume you have a samples table with the Iris dataset.
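For reference, here is a minimal sketch of the shape that table is assumed to have by the queries below (the exact column types in your database may differ):

CREATE TABLE samples (
    sepal_length float
    ,sepal_width float
    ,petal_length float
    ,petal_width float
    ,iris varchar(255)
)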
We start by declaring a table type for the function parameter:
CREATE TYPE SamplesTable AS TABLE (
    id int,
    feature int,
    value float,
    target float
)
Next, let’s prepare samples for training:
DECLARE @numbers TABLE (N int)

-- Generate the numbers 1 through 5, one per feature
INSERT INTO @numbers
SELECT TOP 5 row_number() OVER(ORDER BY t1.number) AS N
FROM master..spt_values AS t1
CROSS JOIN master..spt_values AS t2

DECLARE @samples TABLE(
    sepal_length float
    ,sepal_width float
    ,petal_length float
    ,petal_width float
    ,iris varchar(255)
    ,is_setosa float
    ,is_virginica float
    ,sample_id int
)

-- Take 100 random samples and add the indicator features for the species
INSERT INTO @samples
SELECT TOP 100
    S.*
    ,CASE WHEN S.iris = 'setosa' THEN 1.0 ELSE 0.0 END AS is_setosa
    ,CASE WHEN S.iris = 'virginica' THEN 1.0 ELSE 0.0 END AS is_virginica
    ,row_number() OVER(ORDER BY (SELECT NULL)) AS sample_id
FROM samples AS S
ORDER BY (SELECT ABS(CHECKSUM(NewId())))

DECLARE @samplesPivoted SamplesTable

-- Pivot: one row per (sample, feature) pair, with sepal_length as the target
INSERT INTO @samplesPivoted
SELECT
    S.sample_id
    ,N.N
    ,CASE
        WHEN N.N = 1 THEN S.sepal_width
        WHEN N.N = 2 THEN S.petal_length
        WHEN N.N = 3 THEN S.petal_width
        WHEN N.N = 4 THEN S.is_setosa
        ELSE S.is_virginica
    END
    ,S.sepal_length
FROM @samples AS S
CROSS JOIN @numbers AS N
We generate a table with numbers, then add more features, and finally pivot the samples just like in the last part.
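If you want to sanity-check the pivoted data before training (this step is my addition, not required by the rest of the post), each sample id should appear five times, once per feature, with all five rows sharing the same target:

-- Optional: inspect a few pivoted rows; features run from 1 to 5
-- and the target column repeats sepal_length for every feature
SELECT TOP 10 id, feature, value, target
FROM @samplesPivoted
ORDER BY id, feature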
Finally, our function:
CREATE FUNCTION Train(@samplesPivoted SamplesTable READONLY)
RETURNS @coefficients TABLE(feature int, w float, b float, mse float)
AS
BEGIN
    -- Collect the distinct feature ids so the function works for any dataset
    DECLARE @featureIds TABLE(feature int)
    INSERT INTO @featureIds SELECT DISTINCT feature FROM @samplesPivoted

    -- Initialize coefficients and gradients with default values
    INSERT INTO @coefficients SELECT feature, 0.0, 0.0, -1.0 FROM @featureIds

    DECLARE @gradients TABLE(feature int, gw float, gb float)
    INSERT INTO @gradients SELECT feature, 0.0, 0.0 FROM @featureIds

    DECLARE @learningRate float
    SELECT @learningRate = 0.01

    DECLARE @iterations int
    SELECT @iterations = 500

    DECLARE @currentIteration int
    SELECT @currentIteration = 0

    DECLARE @newCoefficients TABLE(feature int, w float, b float)
    DECLARE @distances TABLE(id int, distance float)
    DECLARE @mse float

    WHILE @currentIteration < @iterations
    BEGIN
        -- Gradient descent step: move the coefficients against the gradients
        DELETE FROM @newCoefficients
        INSERT INTO @newCoefficients
        SELECT C.feature, C.w - @learningRate * G.gw, C.b - @learningRate * G.gb
        FROM @coefficients AS C
        JOIN @gradients AS G ON C.feature = G.feature

        -- Distance = prediction (summed over features) minus target
        DELETE FROM @distances;
        INSERT INTO @distances
        SELECT S.id, SUM(N.w * S.value + N.b) - MAX(S.target)
        FROM @samplesPivoted AS S
        JOIN @newCoefficients AS N ON S.feature = N.feature
        GROUP BY S.id

        SELECT @mse = AVG(D.distance * D.distance) FROM @distances AS D

        -- New gradients: the derivatives of the error for each feature
        DELETE FROM @gradients;
        INSERT INTO @gradients
        SELECT S.feature, AVG(S.value * D.distance), AVG(D.distance)
        FROM @samplesPivoted AS S
        JOIN @distances AS D ON S.id = D.id
        GROUP BY S.feature

        -- Store the new coefficients together with the current error
        DELETE FROM @coefficients;
        INSERT INTO @coefficients SELECT *, @mse FROM @newCoefficients

        SELECT @currentIteration = @currentIteration + 1
    END

    RETURN
END
We extract featureIds so we can pass basically any dataset for training and it should still work. We initialize the coefficients with default values, do the same with the gradients, and prepare some bookkeeping like the iteration count and the learning rate.
Next, in every iteration we start by calculating new coefficients based on the old coefficients and the old gradients. We clear the distances table and calculate the distance (the difference between the predicted value and the expected value) for each sample. Then we calculate the mean squared error.
Next, we need to calculate new gradients. For each feature we calculate the derivatives of the error, and we are done. We just need to store the new coefficients and increase the counter.
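For completeness, this is the math behind those two averages. Writing the prediction for sample $i$ as a sum over features, the distance and the mean squared error are

$$d_i = \sum_f \left( w_f x_{i,f} + b_f \right) - y_i, \qquad \mathrm{MSE} = \frac{1}{n} \sum_{i=1}^{n} d_i^2$$

and the derivatives for feature $f$ are

$$\frac{\partial\,\mathrm{MSE}}{\partial w_f} = \frac{2}{n} \sum_i d_i\, x_{i,f}, \qquad \frac{\partial\,\mathrm{MSE}}{\partial b_f} = \frac{2}{n} \sum_i d_i$$

The constant factor of 2 is effectively folded into the learning rate, which is why the code gets away with plain AVG(S.value * D.distance) and AVG(D.distance).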
Now we can execute the code:
SELECT * FROM Train(@samplesPivoted)
And the result is:
feature     w                      b                      mse
----------- ---------------------- ---------------------- ----------------------
1           0.746997439342549      0.282176586393152      0.098274347087078
2           0.563235001391582      0.282176586393152      0.098274347087078
3           0.0230764649956309     0.282176586393152      0.098274347087078
4           0.193704294614636      0.282176586393152      0.098274347087078
5           -0.110068224303597     0.282176586393152      0.098274347087078
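The post ends with training, but if you want to see the coefficients in action, here is a minimal sketch (my addition, reusing the same sum-over-features expression that Train uses for the distances) that compares predictions with the actual targets:

DECLARE @trained TABLE(feature int, w float, b float, mse float)
INSERT INTO @trained SELECT * FROM Train(@samplesPivoted)

-- The predicted sepal_length is the sum of w * value + b over all features,
-- the same expression Train uses when computing distances
SELECT
    S.id
    ,SUM(T.w * S.value + T.b) AS predicted_sepal_length
    ,MAX(S.target) AS actual_sepal_length
FROM @samplesPivoted AS S
JOIN @trained AS T ON S.feature = T.feature
GROUP BY S.id
ORDER BY S.id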