Friday, 20 May 2016

Sqoop & SQL Server #1 - An Overview




Apache Sqoop is an open source tool that allows users to extract data from a structured data store into Hadoop for further processing. Sqoop can also export data back to the RDBMS for consumption by  other clients.

It is a command line tool with a set of tools and commands from which you can construct complex import and export scripts.

Within a SQL Server context it is analogous to BCP. 

Why is it important
  • It allows existing RDBMS data to be loaded into an HDFS cluster
  • Many existing analytic tools cannot address HDFS directly so Big Data analysts can be required to export data back to traditional data platforms for analysis with existing tools

The following blog posts are focused on using Sqoop with SQL Server data. Most online tutorials focus on MySQL. When learning Sqoop I found it difficult to find examples for SQL Server. In fact it took days and many grey hairs figuring out how to connect to a named instance. 

I use the Cloudera Quickstart VM which is easily hosted in Oracle VIrtualBox.

Posts in this series include;
Resources
Sqoop User Guide
Sqoop Wiki
Cloudera Quickstart VM
Microsoft JDBC Drivers
WinSCP
O'Reilly Publishing Sqoop Cookbook
Cloudera Community - for submitting questions and searching existing answers

For convenience, I've supplied T-SQL scripts to create and populate a database I'll be using throughout these examples.

Create Database
USE [master]
GO

CREATE DATABASE [Utility]
 CONTAINMENT = NONE
 ON PRIMARY
( NAME = N'Utility', FILENAME = N'C:\Data\Utility.mdf' , SIZE = 65536KB , MAXSIZE = UNLIMITED, FILEGROWTH = 65536KB )
 LOG ON
( NAME = N'Utility_log', FILENAME = N'C:\Logs\Utility_log.ldf' , SIZE = 8192KB , MAXSIZE = UNLIMITED , FILEGROWTH = 65536)
GO

Create Login
USE [master]
GO

CREATE LOGIN [ClouderaAccess] WITH PASSWORD=N'GiveMeData', DEFAULT_DATABASE=[master], DEFAULT_LANGUAGE=[us_english], CHECK_EXPIRATION=OFF, CHECK_POLICY=OFF
GO

Create Table
USE [Utility]
GO

CREATE TABLE [dbo].[Customers]
(
      [CustomerID] [int] IDENTITY(1,1) NOT NULL,
      [FirstName] [varchar](50) NOT NULL,
[Surname] [varchar](50) NOT NULL,
[DateOfBirth] [date] NOT NULL,
[EmailAddress] [varchar](100) NULL,
[PostCode] [varchar](8) NULL,
[MarketingOptIn] [bit] NULL,
[AutoRenewalOptIn] [bit] NULL,
[CreatedDateTime] [datetime] NOT NULL,
[ModifiedDateTime] [datetime] NULL,
CONSTRAINT [PK_Customers] PRIMARY KEY CLUSTERED
([CustomerID] ASC) WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]

GO

Populate Table - 10 test rows

USE [Utility]
GO
SET IDENTITY_INSERT [dbo].[Customers] ON

GO
INSERT [dbo].[Customers] ([CustomerID], [FirstName], [Surname], [DateOfBirth], [EmailAddress], [PostCode], [MarketingOptIn], [AutoRenewalOptIn], [CreatedDateTime], [ModifiedDateTime]) VALUES (1, N'G', N'THOMAS', CAST(N'1974-07-12' AS Date), N'G.THOMAS@ntlworld.com', N'RM6 5TT', 0, 1, CAST(N'2010-06-16 00:00:00.000' AS DateTime), CAST(N'2010-06-16 00:00:00.000' AS DateTime))
GO
INSERT [dbo].[Customers] ([CustomerID], [FirstName], [Surname], [DateOfBirth], [EmailAddress], [PostCode], [MarketingOptIn], [AutoRenewalOptIn], [CreatedDateTime], [ModifiedDateTime]) VALUES (2, N'STEPHANIE', N'CARTER', CAST(N'1968-04-17' AS Date), N'S.CARTER@hotmail.co.uk', N'GU21 3JY', 0, 0, CAST(N'2014-07-11 00:00:00.000' AS DateTime), CAST(N'2014-07-11 00:00:00.000' AS DateTime))
GO
INSERT [dbo].[Customers] ([CustomerID], [FirstName], [Surname], [DateOfBirth], [EmailAddress], [PostCode], [MarketingOptIn], [AutoRenewalOptIn], [CreatedDateTime], [ModifiedDateTime]) VALUES (3, N'COLIN', N'BLACKWOOD', CAST(N'1979-10-19' AS Date), N'C.BLACKWOOD@googlemail.com', N'SE3 7JY ', 0, 1, CAST(N'2011-11-27 00:00:00.000' AS DateTime), CAST(N'2011-11-27 00:00:00.000' AS DateTime))
GO
INSERT [dbo].[Customers] ([CustomerID], [FirstName], [Surname], [DateOfBirth], [EmailAddress], [PostCode], [MarketingOptIn], [AutoRenewalOptIn], [CreatedDateTime], [ModifiedDateTime]) VALUES (4, N'L', N'BLACKWOOD', CAST(N'1996-01-01' AS Date), N'L.BLACKWOOD@inbox.com', N'SE3 7JY ', 0, 0, CAST(N'2011-11-27 00:00:00.000' AS DateTime), CAST(N'2011-11-27 00:00:00.000' AS DateTime))
GO
INSERT [dbo].[Customers] ([CustomerID], [FirstName], [Surname], [DateOfBirth], [EmailAddress], [PostCode], [MarketingOptIn], [AutoRenewalOptIn], [CreatedDateTime], [ModifiedDateTime]) VALUES (5, N'DANIEL', N'SENNITT', CAST(N'1975-01-01' AS Date), N'D.SENNITT@virginmedia.com', N'RG4 6HL ', 0, 0, CAST(N'2012-01-01 00:00:00.000' AS DateTime), CAST(N'2012-01-01 00:00:00.000' AS DateTime))
GO
INSERT [dbo].[Customers] ([CustomerID], [FirstName], [Surname], [DateOfBirth], [EmailAddress], [PostCode], [MarketingOptIn], [AutoRenewalOptIn], [CreatedDateTime], [ModifiedDateTime]) VALUES (6, N'M', N'ROSSITER', CAST(N'1982-04-18' AS Date), N'M.ROSSITER@live.com', N'CH49 9AP', 0, 0, CAST(N'2011-02-04 00:00:00.000' AS DateTime), CAST(N'2011-02-04 00:00:00.000' AS DateTime))
GO
INSERT [dbo].[Customers] ([CustomerID], [FirstName], [Surname], [DateOfBirth], [EmailAddress], [PostCode], [MarketingOptIn], [AutoRenewalOptIn], [CreatedDateTime], [ModifiedDateTime]) VALUES (7, N'JULIE', N'FLEMING', CAST(N'1978-07-12' AS Date), N'J.FLEMING@mail.com', N'IV2 3TX ', 1, 0, CAST(N'2012-08-02 00:00:00.000' AS DateTime), CAST(N'2012-08-02 00:00:00.000' AS DateTime))
GO
INSERT [dbo].[Customers] ([CustomerID], [FirstName], [Surname], [DateOfBirth], [EmailAddress], [PostCode], [MarketingOptIn], [AutoRenewalOptIn], [CreatedDateTime], [ModifiedDateTime]) VALUES (8, N'MICHAEL', N'FLEMING', CAST(N'1934-02-10' AS Date), N'M.FLEMING@live.com', N'IV2 3TX ', 0, 0, CAST(N'2012-08-02 00:00:00.000' AS DateTime), CAST(N'2012-08-02 00:00:00.000' AS DateTime))
GO
INSERT [dbo].[Customers] ([CustomerID], [FirstName], [Surname], [DateOfBirth], [EmailAddress], [PostCode], [MarketingOptIn], [AutoRenewalOptIn], [CreatedDateTime], [ModifiedDateTime]) VALUES (9, N'H A', N'PATEL', CAST(N'1972-07-14' AS Date), N'H.PATEL@btinternet.com', N'G52 3LD', 0, 1, CAST(N'2011-02-24 00:00:00.000' AS DateTime), CAST(N'2011-02-24 00:00:00.000' AS DateTime))
GO
INSERT [dbo].[Customers] ([CustomerID], [FirstName], [Surname], [DateOfBirth], [EmailAddress], [PostCode], [MarketingOptIn], [AutoRenewalOptIn], [CreatedDateTime], [ModifiedDateTime]) VALUES (10, N'CHRISTOPHER', N'KEECH', CAST(N'1993-12-14' AS Date), N'C.KEECH@hotmail.co.uk', N'TS269LN ', 1, 0, CAST(N'2013-08-22 00:00:00.000' AS DateTime), CAST(N'2013-08-22 00:00:00.000' AS DateTime))
GO
INSERT [dbo].[Customers] ([CustomerID], [FirstName], [Surname], [DateOfBirth], [EmailAddress], [PostCode], [MarketingOptIn], [AutoRenewalOptIn], [CreatedDateTime], [ModifiedDateTime]) VALUES (11, N'HANNAH', N'KEECH-REEVES', CAST(N'1988-07-01' AS Date), N'H.KEECH-REEVES@yahoo.co.uk', N'TS269LN ', 0, 0, CAST(N'2013-08-22 00:00:00.000' AS DateTime), CAST(N'2013-08-22 00:00:00.000' AS DateTime))
GO
SET IDENTITY_INSERT [dbo].[Customers] OFF
GO


Detect Local Connection Properties
SELECT
  CONNECTIONPROPERTY('net_transport') AS net_transport,
  CONNECTIONPROPERTY('protocol_type') AS protocol_type,
  CONNECTIONPROPERTY('auth_scheme') AS auth_scheme,
  CONNECTIONPROPERTY('local_net_address') AS local_net_address,
  CONNECTIONPROPERTY('local_tcp_port') AS local_tcp_port,
  CONNECTIONPROPERTY('client_net_address') AS client_net_address




No comments:

Post a Comment