Preparing for Data Mining – Row Sampling
You can run the code in SSMS or Excute SQL Task in SSIS.
USE AdventureWorksDW
GO
–Extracting 20% of data for test data
–20 percent = 0.2; 30%=0.3
SELECT *
INTO TK445_Ch09_TestSet
FROM vTargetMail
WHERE RAND(CHECKSUM(NEWID())%1000000000+CustomerKey) < 0.2
GO
— 80% training set
SELECT *
INTO TK445_Ch09_TrainingSet
FROM vTargetMail
EXCEPT
SELECT *
FROM TK445_Ch09_TestSet
GO
— Check if we get back 18484 cases with UNION ALL
SELECT *
FROM TK445_Ch09_TrainingSet
UNION ALL
SELECT *
FROM TK445_Ch09_TestSet
GO
— Check the first four moments
— Now check them on training & test data sets
SELECT Mean = MIN(m.mean),
[StDev] = MIN(m.[StDev]),
Skew = SUM(((Age*1.0-m.mean)/m.[StDev])*((Age*1.0-m.mean)/m.[StDev])*((Age*1.0-m.mean)/m.[StDev])) * MIN(m.corrfact1),
Kurt = SUM(SQUARE(SQUARE(((Age*1.0-m.mean)/m.[StDev])))) * MIN(m.corrfact2) – MIN(m.subfact)
FROM TK445_Ch09_TrainingSet v CROSS JOIN
(SELECT mean = AVG(Age*1.0), [StDev] = STDEV(Age),
corrfact1 = COUNT(*)*1.0 / (COUNT(*)-1) / (COUNT(*)-2),
corrfact2 = COUNT(*)*1.0 * (COUNT(*)+1) / (COUNT(*)-1) / (COUNT(*)-2) / (COUNT(*)-3),
subfact = 3.0 * SQUARE((COUNT(*)-1)) / (COUNT(*)-2) / (COUNT(*)-3)
FROM TK445_Ch09_TrainingSet v) AS m
UNION
SELECT Mean = MIN(m.mean),
[StDev] = MIN(m.[StDev]),
Skew = SUM(((Age*1.0-m.mean)/m.[StDev])*((Age*1.0-m.mean)/m.[StDev])*((Age*1.0-m.mean)/m.[StDev])) * MIN(m.corrfact1),
Kurt = SUM(SQUARE(SQUARE(((Age*1.0-m.mean)/m.[StDev])))) * MIN(m.corrfact2) – MIN(m.subfact)
FROM TK445_Ch09_TestSet v CROSS JOIN
(SELECT mean = AVG(Age*1.0), [StDev] = STDEV(Age),
corrfact1 = COUNT(*)*1.0 / (COUNT(*)-1) / (COUNT(*)-2),
corrfact2 = COUNT(*)*1.0 * (COUNT(*)+1) / (COUNT(*)-1) / (COUNT(*)-2) / (COUNT(*)-3),
subfact = 3.0 * SQUARE((COUNT(*)-1)) / (COUNT(*)-2) / (COUNT(*)-3)
FROM TK445_Ch09_TestSet v) AS m
GO
— we can see there are no significant differences, so split is done well