<!doctype html>
<html>
<head>
<meta charset="utf-8">
<!-- Always force latest IE rendering engine or request Chrome Frame -->
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible">
<!-- REPLACE X WITH PRODUCT NAME -->
<title>PXF Installation and Administration | Pivotal Docs</title>
<!-- Local CSS stylesheets -->
<link href="/stylesheets/master.css" media="screen,print" rel="stylesheet" type="text/css" />
<link href="/stylesheets/breadcrumbs.css" media="screen,print" rel="stylesheet" type="text/css" />
<link href="/stylesheets/search.css" media="screen,print" rel="stylesheet" type="text/css" />
<link href="/stylesheets/portal-style.css" media="screen,print" rel="stylesheet" type="text/css" />
<link href="/stylesheets/printable.css" media="print" rel="stylesheet" type="text/css" />
<!-- Confluence HTML stylesheet -->
<link href="/stylesheets/site-conf.css" media="screen,print" rel="stylesheet" type="text/css" />
<!-- Left-navigation code -->
<!-- http://www.designchemical.com/lab/jquery-vertical-accordion-menu-plugin/examples/# -->
<link href="/stylesheets/dcaccordion.css" rel="stylesheet" type="text/css" />
<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js" type="text/javascript"></script>
<script src="/javascripts/jquery.cookie.js" type="text/javascript"></script>
<script src="/javascripts/jquery.hoverIntent.minified.js" type="text/javascript"></script>
<script src="/javascripts/jquery.dcjqaccordion.2.7.min.js" type="text/javascript"></script>
<script type="text/javascript">
$(document).ready(function($){
$('#accordion-1').dcAccordion({
eventType: 'click',
autoClose: true,
saveState: true,
disableLink: false,
speed: 'fast',
classActive: 'test',
showCount: false
});
});
</script>
<link href="/stylesheets/grey.css" rel="stylesheet" type="text/css" />
<!-- End left-navigation code -->
<script src="/javascripts/all.js" type="text/javascript"></script>
<link href='http://www.gopivotal.com/misc/favicon.ico' rel='shortcut icon'>
<script type="text/javascript">
if (window.location.host === 'docs.gopivotal.com') {
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-39702075-1']);
_gaq.push(['_setDomainName', 'gopivotal.com']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
}
</script>
</head>
<body class="pivotalcf pivotalcf_getstarted pivotalcf_getstarted_index">
<div class="viewport">
<div class="mobile-navigation--wrapper mobile-only">
<div class="navigation-drawer--container">
<div class="navigation-item-list">
<div class="navbar-link active">
<a href="http://gopivotal.com">
Home
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/paas">
PaaS
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/big-data">
Big Data
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/agile">
Agile
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/support">
Help & Support
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/products">
Products
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/solutions">
Solutions
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/partners">
Partners
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
</div>
</div>
<div class="mobile-nav">
<div class="nav-icon js-open-nav-drawer">
<i class="icon-reorder"></i>
</div>
<div class="header-center-icon">
<a href="http://gopivotal.com">
<div class="icon icon-pivotal-logo-mobile"></div>
</a>
</div>
</div>
</div>
<div class='wrap'>
<script src="//use.typekit.net/clb0qji.js" type="text/javascript"></script>
<script type="text/javascript">
try {
Typekit.load();
} catch (e) {
}
</script>
<script type="text/javascript">
document.domain = "gopivotal.com";
</script>
<script type="text/javascript">
WebFontConfig = {
google: { families: [ 'Source+Sans+Pro:300italic,400italic,600italic,300,400,600:latin' ] }
};
(function() {
var wf = document.createElement('script');
wf.src = ('https:' == document.location.protocol ? 'https' : 'http') +
'://ajax.googleapis.com/ajax/libs/webfont/1/webfont.js';
wf.type = 'text/javascript';
wf.async = 'true';
var s = document.getElementsByTagName('script')[0];
s.parentNode.insertBefore(wf, s);
})(); </script>
<div id="search-dropdown-box">
<div class="search-dropdown--container js-search-dropdown">
<div class="container-fluid">
<div class="close-menu-large"><img src="http://www.gopivotal.com/sites/all/themes/gopo13/images/icon-close.png" /></div>
<div class="search-form--container">
<div class="form-search">
<div class='gcse-search'></div>
<script src="http://www.google.com/jsapi" type="text/javascript"></script>
<script src="/javascripts/cse.js" type="text/javascript"></script>
</div>
</div>
</div>
</div>
</div>
<header class="navbar desktop-only" id="nav">
<div class="navbar-inner">
<div class="container-fluid">
<div class="pivotal-logo--container">
<a class="pivotal-logo" href="http://gopivotal.com"><span></span></a>
</div>
<ul class="nav pull-right">
<li class="navbar-link">
<a href="http://www.gopivotal.com/paas" id="paas-nav-link">PaaS</a>
</li>
<li class="navbar-link">
<a href="http://www.gopivotal.com/big-data" id="big-data-nav-link">BIG DATA</a>
</li>
<li class="navbar-link">
<a href="http://www.gopivotal.com/agile" id="agile-nav-link">AGILE</a>
</li>
<li class="navbar-link">
<a href="http://www.gopivotal.com/oss" id="oss-nav-link">OSS</a>
</li>
<li class="nav-search">
<a class="js-search-input-open" id="click-to-search"><span></span></a>
</li>
</ul>
</div>
<a href="http://www.gopivotal.com/contact">
<img id="get-started" src="http://www.gopivotal.com/sites/all/themes/gopo13/images/get-started.png">
</a>
</div>
</header>
<div class="main-wrap">
<div class="container-fluid">
<!-- Google CSE Search Box -->
<div id='docs-search'>
<gcse:search></gcse:search>
</div>
<div id='all-docs-link'>
<a href="http://docs.gopivotal.com/">All Documentation</a>
</div>
<div class="container">
<div id="sub-nav" class="nav-container">
<!-- Collapsible left-navigation-->
<ul class="accordion" id="accordion-1">
<!-- REPLACE <li/> NODES-->
<li>
<a href="index.html">Home</a></br>
<li>
<a href="PivotalHD.html">Pivotal HD 2.0.1</a>
<ul>
<li>
<a href="PHDEnterprise2.0.1ReleaseNotes.html">PHD Enterprise 2.0.1 Release Notes</a>
</li>
</ul>
<ul>
<li>
<a href="PHDInstallationandAdministration.html">PHD Installation and Administration</a>
<ul>
<li>
<a href="OverviewofPHD.html">Overview of PHD</a>
</li>
</ul>
<ul>
<li>
<a href="InstallationOverview.html">Installation Overview</a>
</li>
</ul>
<ul>
<li>
<a href="PHDInstallationChecklist.html">PHD Installation Checklist</a>
</li>
</ul>
<ul>
<li>
<a href="InstallingPHDUsingtheCLI.html">Installing PHD Using the CLI</a>
</li>
</ul>
<ul>
<li>
<a href="UpgradeChecklist.html">Upgrade Checklist</a>
</li>
</ul>
<ul>
<li>
<a href="UpgradingPHDUsingtheCLI.html">Upgrading PHD Using the CLI</a>
</li>
</ul>
<ul>
<li>
<a href="AdministeringPHDUsingtheCLI.html">Administering PHD Using the CLI</a>
</li>
</ul>
<ul>
<li>
<a href="PHDFAQFrequentlyAskedQuestions.html">PHD FAQ (Frequently Asked Questions)</a>
</li>
</ul>
<ul>
<li>
<a href="PHDTroubleshooting.html">PHD Troubleshooting</a>
</li>
</ul>
</li>
</ul>
<ul>
<li>
<a href="StackandToolsReference.html">Stack and Tools Reference</a>
<ul>
<li>
<a href="OverviewofApacheStackandPivotalComponents.html">Overview of Apache Stack and Pivotal Components</a>
</li>
</ul>
<ul>
<li>
<a href="ManuallyInstallingPivotalHD2.0Stack.html">Manually Installing Pivotal HD 2.0 Stack</a>
</li>
</ul>
<ul>
<li>
<a href="ManuallyUpgradingPivotalHDStackfrom1.1.1to2.0.html">Manually Upgrading Pivotal HD Stack from 1.1.1 to 2.0</a>
</li>
</ul>
<ul>
<li>
<a href="PivotalHadoopEnhancements.html">Pivotal Hadoop Enhancements</a>
</li>
</ul>
<ul>
<li>
<a href="Security.html">Security</a>
</li>
</ul>
</li>
</ul>
</li>
<li>
<a href="PivotalCommandCenter.html">Pivotal Command Center 2.2.1</a>
<ul>
<li>
<a href="PCC2.2.1ReleaseNotes.html">PCC 2.2.1 Release Notes</a>
</li>
</ul>
<ul>
<li>
<a href="PCCUserGuide.html">PCC User Guide</a>
<ul>
<li>
<a href="PCCOverview.html">PCC Overview</a>
</li>
</ul>
<ul>
<li>
<a href="PCCInstallationChecklist.html">PCC Installation Checklist</a>
</li>
</ul>
<ul>
<li>
<a href="InstallingPCC.html">Installing PCC</a>
</li>
</ul>
<ul>
<li>
<a href="UsingPCC.html">Using PCC</a>
</li>
</ul>
<ul>
<li>
<a href="CreatingaYUMEPELRepository.html">Creating a YUM EPEL Repository</a>
</li>
</ul>
<ul>
<li>
<a href="CommandLineReference.html">Command Line Reference</a>
</li>
</ul>
</li>
</ul>
</li>
<li>
<a href="PivotalHAWQ.html">Pivotal HAWQ 1.2.0</a>
<ul>
<li>
<a href="HAWQ1.2.0.1ReleaseNotes.html">HAWQ 1.2.0.1 Release Notes</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQInstallationandUpgrade.html">HAWQ Installation and Upgrade</a>
<ul>
<li>
<a href="PreparingtoInstallHAWQ.html">Preparing to Install HAWQ</a>
</li>
</ul>
<ul>
<li>
<a href="InstallingHAWQ.html">Installing HAWQ</a>
</li>
</ul>
<ul>
<li>
<a href="InstallingtheHAWQComponents.html">Installing the HAWQ Components</a>
</li>
</ul>
<ul>
<li>
<a href="UpgradingHAWQandComponents.html">Upgrading HAWQ and Components</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQConfigurationParameterReference.html">HAWQ Configuration Parameter Reference</a>
</li>
</ul>
</li>
</ul>
<ul>
<li>
<a href="HAWQAdministration.html">HAWQ Administration</a>
<ul>
<li>
<a href="HAWQOverview.html">HAWQ Overview</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQQueryProcessing.html">HAWQ Query Processing</a>
</li>
</ul>
<ul>
<li>
<a href="UsingHAWQtoQueryData.html">Using HAWQ to Query Data</a>
</li>
</ul>
<ul>
<li>
<a href="ConfiguringClientAuthentication.html">Configuring Client Authentication</a>
</li>
</ul>
<ul>
<li>
<a href="KerberosAuthentication.html">Kerberos Authentication</a>
</li>
</ul>
<ul>
<li>
<a href="ExpandingtheHAWQSystem.html">Expanding the HAWQ System</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQInputFormatforMapReduce.html">HAWQ InputFormat for MapReduce</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQFilespacesandHighAvailabilityEnabledHDFS.html">HAWQ Filespaces and High Availability Enabled HDFS</a>
</li>
</ul>
<ul>
<li>
<a href="SQLCommandReference.html">SQL Command Reference</a>
</li>
</ul>
<ul>
<li>
<a href="ManagementUtilityReference.html">Management Utility Reference</a>
</li>
</ul>
<ul>
<li>
<a href="ClientUtilityReference.html">Client Utility Reference</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQServerConfigurationParameters.html">HAWQ Server Configuration Parameters</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQEnvironmentVariables.html">HAWQ Environment Variables</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQDataTypes.html">HAWQ Data Types</a>
</li>
</ul>
<ul>
<li>
<a href="SystemCatalogReference.html">System Catalog Reference</a>
</li>
</ul>
<ul>
<li>
<a href="hawq_toolkitReference.html">hawq_toolkit Reference</a>
</li>
</ul>
</li>
</ul>
<ul>
<li>
<a href="PivotalExtensionFrameworkPXF.html">Pivotal Extension Framework (PXF)</a>
<ul>
<li>
<a href="PXFInstallationandAdministration.html">PXF Installation and Administration</a>
</li>
</ul>
<ul>
<li>
<a href="PXFExternalTableandAPIReference.html">PXF External Table and API Reference</a>
</li>
</ul>
</div><!--end of sub-nav-->
<h3 class="title-container">PXF Installation and Administration</h3>
<div class="content">
<!-- Python script replaces main content -->
<div id ="main"><div style="visibility:hidden; height:2px;">Pivotal Product Documentation : PXF Installation and Administration</div><div class="wiki-content group" id="main-content">
<p align="LEFT">PXF is an extensible framework that allows HAWQ to query external system data. PXF includes built-in connectors for accessing data that exists inside HDFS files, Hive tables, and HBase tables. Users can also create their own connectors to other parallel data stores or processing engines. To create these connectors using <span style="line-height: 1.4285;">JAVA plugins, see <a href="PXFExternalTableandAPIReference.html">PXF External Table and API Reference</a></span><span style="line-height: 1.4285;">.</span><span style="line-height: 1.4285;"> </span></p><p align="LEFT"><span style="line-height: 1.4285;"><style type="text/css">/*<![CDATA[*/
div.rbtoc1400035796851 {padding: 0px;}
div.rbtoc1400035796851 ul {list-style: disc;margin-left: 0px;}
div.rbtoc1400035796851 li {margin-left: 0px;padding-left: 0px;}
/*]]>*/</style><div class="toc-macro rbtoc1400035796851">
<ul class="toc-indentation">
<li><a href="#PXFInstallationandAdministration-Prerequisites">Prerequisites</a></li>
<li><a href="#PXFInstallationandAdministration-UpgradingfromGPXF">Upgrading from GPXF </a></li>
<li><a href="#PXFInstallationandAdministration-InstallingPXF">Installing PXF</a>
<ul class="toc-indentation">
<li><a href="#PXFInstallationandAdministration-InstallingthePXFFrameworkJARsFiles">Installing the PXF Framework JARs Files</a></li>
<li><a href="#PXFInstallationandAdministration-SettinguptheJavaClasspath">Setting up the Java Classpath</a></li>
<li><a href="#PXFInstallationandAdministration-EnablingtheRESTService">Enabling the REST Service </a></li>
<li><a href="#PXFInstallationandAdministration-RestartingtheCluster">Restarting the Cluster </a></li>
</ul>
</li>
<li><a href="#PXFInstallationandAdministration-SecurePXF">Secure PXF</a>
<ul class="toc-indentation">
<li><a href="#PXFInstallationandAdministration-Requirements">Requirements</a></li>
</ul>
</li>
<li><a href="#PXFInstallationandAdministration-ReadingandWritingDatawithPXF">Reading and Writing Data with PXF</a>
<ul class="toc-indentation">
<li><a href="#PXFInstallationandAdministration-Built-inProfiles">Built-in Profiles</a></li>
</ul>
</li>
<li><a href="#PXFInstallationandAdministration-AccessingHDFSFileDatawithPXF">Accessing HDFS File Data with PXF </a>
<ul class="toc-indentation">
<li><a href="#PXFInstallationandAdministration-InstallingthePXFHDFSplugin">Installing the PXF HDFS plugin</a></li>
<li><a href="#PXFInstallationandAdministration-Syntax">Syntax</a></li>
<li><a href="#PXFInstallationandAdministration-FORMATclause">FORMAT clause</a></li>
<li><a href="#PXFInstallationandAdministration-Fragmenter">Fragmenter</a></li>
<li><a href="#PXFInstallationandAdministration-Accessor">Accessor</a></li>
<li><a href="#PXFInstallationandAdministration-Resolver">Resolver</a></li>
<li><a href="#PXFInstallationandAdministration-AdditionalOptions">Additional Options</a></li>
<li><a href="#PXFInstallationandAdministration-AccessingdataonaHighAvailabilityHDFScluster">Accessing data on a High Availability HDFS cluster</a></li>
<li><a href="#PXFInstallationandAdministration-Recordkeyinkey-valuefileformats">Record key in key-value file formats</a></li>
<li><a href="#PXFInstallationandAdministration-Example">Example</a></li>
</ul>
</li>
<li><a href="#PXFInstallationandAdministration-CustomizedWritableSchemaFileGuidelines">Customized Writable Schema File Guidelines</a></li>
<li><a href="#PXFInstallationandAdministration-AccessingHiveDatawithPXF">Accessing Hive Data with PXF</a>
<ul class="toc-indentation">
<li><a href="#PXFInstallationandAdministration-InstallingthePXFHIVEplugin">Installing the PXF HIVE plugin</a></li>
<li><a href="#PXFInstallationandAdministration-Syntax.1">Syntax</a></li>
<li><a href="#PXFInstallationandAdministration-HiveCommandLine">Hive Command Line</a></li>
<li><a href="#PXFInstallationandAdministration-MappingHiveCollectionTypes">Mapping Hive Collection Types</a></li>
<li><a href="#PXFInstallationandAdministration-PartitionFiltering">Partition Filtering</a></li>
</ul>
</li>
<li><a href="#PXFInstallationandAdministration-AccessingHBaseDatawithPXF">Accessing HBase Data with PXF</a>
<ul class="toc-indentation">
<li><a href="#PXFInstallationandAdministration-InstallingthePXFHBaseplugin">Installing the PXF HBase plugin</a></li>
<li><a href="#PXFInstallationandAdministration-Syntax.2">Syntax</a></li>
</ul>
</li>
<li><a href="#PXFInstallationandAdministration-ColumnMapping">Column Mapping</a></li>
<li><a href="#PXFInstallationandAdministration-RowKey">Row Key</a>
<ul class="toc-indentation">
<li><a href="#PXFInstallationandAdministration-DirectMapping">Direct Mapping</a></li>
<li><a href="#PXFInstallationandAdministration-IndirectMapping(viaLookupTable)">Indirect Mapping (via Lookup Table)</a></li>
</ul>
</li>
<li><a href="#PXFInstallationandAdministration-AccessingGemFireDatawithPXF">Accessing GemFire Data with PXF</a>
<ul class="toc-indentation">
<li><a href="#PXFInstallationandAdministration-Syntax.3">Syntax</a></li>
</ul>
</li>
<li><a href="#PXFInstallationandAdministration-Troubleshooting">Troubleshooting</a></li>
</ul>
</div><br/></span></p><h2 id="PXFInstallationandAdministration-Prerequisites"><span style="line-height: 1.4285;">Prerequisites</span></h2><p>Check that the following systems are installed and running before you install PXF:</p><ul><li>HAWQ</li><li>Pivotal Hadoop (PHD)</li><li>Hadoop File System (HDFS) with the REST service enabled on the Namenode and all the Datanodes.</li><li>Hive: Check that the Hive Metastore service is available and running. This is especially important on clusters where you have set the <em>hive.metastore.uris</em> property in the <em>hive-site.xml</em> file on the Namenode to point to that cluster.</li><li><p>HBase</p></li><li>When configuring Secure (Kerberized) HDFS, the NameNode port must be 8020 for PXF to function. This limitation will be removed in the next release.</li></ul><h2 id="PXFInstallationandAdministration-UpgradingfromGPXF">Upgrading from GPXF</h2><p>If you have a previous version of HAWQ or GPXF installed, see the Pivotal ADS <span style="color: rgb(34,34,34);">1.2.0.0</span> Release Notes for instructions on how to upgrade to HAWQ 1.2.x.</p><p>Please note the following:</p><ol><li>DROP the tables created using GPXF and CREATE them again using PXF.</li><li>When you CREATE tables for PXF, remember to perform the following:<ol><li style="margin-left: 30.0px;">Change the protocol name in the LOCATION clause from gpxf to pxf.</li><li style="margin-left: 30.0px;">Ensure that either a Profile or a Fragmenter, Accessor, and Resolver are <em>always</em> specified for the table.</li><li style="margin-left: 30.0px;">If you are not using a Profile, check that you are using the new names for the Fragmenter, Accessor, and Resolver classes. See <a href="PXFExternalTableandAPIReference.html">PXF External Table and API Reference</a> for more information about the API.</li></ol></li><li>Check that you are using the correct GUCs (server configuration parameters) for PXF:</li></ol><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">gpxf_enable_filter_pushdown -> pxf_enable_filter_pushdown
gpxf_enable_stat_collection -> pxf_enable_stat_collection
gpxf_enable_locality_optimizations -> pxf_enable_locality_optimizations</pre>
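</div></div>
<p>For example, a setting that previously used a gpxf_ name is now set and inspected with the corresponding pxf_ name (an illustrative sketch; adjust the value to your needs):</p>
<div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">-- illustrative only: set and check a renamed PXF GUC in a HAWQ session
SET pxf_enable_filter_pushdown = on;
SHOW pxf_enable_filter_pushdown;</pre>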
</div></div><h2 id="PXFInstallationandAdministration-InstallingPXF">Installing PXF</h2> <div class="aui-message hint shadowed information-macro">
<p class="title">Note</p>
<span class="aui-icon icon-hint">Icon</span>
<div class="message-content">
<p>This topic describes how you can install PXF as a separate component, if you did not install it using the Pivotal Hadoop (HD) Enterprise Command Line Interface.</p>
</div>
</div>
<p align="LEFT">This topic contains the following information:</p><ul><li>Installing the PXF JARs File</li><li>Setting up the Java Classpath</li><li>Enabling REST service</li><li>Restarting the Hadoop Cluster</li><li>Testing PXF with HDFS Files</li><li>Installing PXF to Work with Hive</li><li>Testing Hive and PXF Integration</li></ul><p align="LEFT">The following steps are required to install and configure PXF on a PHD 2.0.0 cluster.</p> <div class="aui-message hint shadowed information-macro">
<p class="title">Notes</p>
<span class="aui-icon icon-hint">Icon</span>
<div class="message-content">
<ul><li>HBase steps are required only if you use PXF with HBase.</li><li>Hive steps are required only if you use PXF with Hive.</li><li>All steps must be performed on all nodes, unless noted otherwise.</li></ul>
</div>
</div>
<h3 id="PXFInstallationandAdministration-InstallingthePXFFrameworkJARsFiles">Installing the PXF Framework JARs Files</h3><p align="LEFT">Install the PXF Framework JARs files on all nodes in the cluster:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: bash; gutter: false" style="font-size:12px;">sudo rpm -i pxf-core-2.2.0-x.rpm
sudo rpm -i pxf-api-2.2.0-x.rpm</pre>
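</div></div>
<p>Optionally, you can confirm that the packages installed and that the softlinks described below were created (an illustrative check; output varies by installation):</p>
<div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: bash; gutter: false" style="font-size:12px;"># illustrative only: list the installed PXF packages and the softlinks
rpm -qa | grep pxf
ls -l /usr/lib/gphd/pxf</pre>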
</div></div><ul><li class="Code">PXF RPMs reside in the Pivotal ADS/HAWQ stack file. </li><li>The script installs the JARs files at the default location at <em>/usr/lib/gphd/pxf-2.2.0</em>. Two S<em>oftlinks:</em> pxf-core.jar and pxf-api.jar will be created in <em>/usr/lib/gphd/pxf</em></li></ul><h3 id="PXFInstallationandAdministration-SettinguptheJavaClasspath">Setting up the Java Classpath</h3><p align="LEFT">Append the following lines to the <em>/etc/gphd/hadoop/conf/hadoop-env.sh</em> on all nodes in the cluster:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: bash; gutter: false" style="font-size:12px;">export GPHD_ROOT=/usr/lib/gphd
export HADOOP_CLASSPATH=\
$GPHD_ROOT/pxf/pxf-core.jar:\
$GPHD_ROOT/pxf/pxf-api.jar:\
$GPHD_ROOT/publicstage:\
$GPHD_ROOT/zookeeper/zookeeper-3.4.5-gphd-3.0.0.0.jar:\
$GPHD_ROOT/hbase/lib/hbase-common-0.96.0-hadoop2-gphd-3.0.0.0.jar:\
$GPHD_ROOT/hbase/lib/hbase-protocol-0.96.0-hadoop2-gphd-3.0.0.0.jar:\
$GPHD_ROOT/hbase/lib/hbase-client-0.96.0-hadoop2-gphd-3.0.0.0.jar:\
$GPHD_ROOT/hbase/lib/hbase-thrift-0.96.0-hadoop2-gphd-3.0.0.0.jar:\
$GPHD_ROOT/hbase/lib/htrace-core-2.01.jar:\
/etc/gphd/hbase/conf:\
$GPHD_ROOT/hive/lib/hive-service-0.12.0-gphd-3.0.0.0.jar:\
$GPHD_ROOT/hive/lib/hive-metastore-0.12.0-gphd-3.0.0.0.jar:\
$GPHD_ROOT/hive/lib/hive-common-0.12.0-gphd-3.0.0.0.jar:\
$GPHD_ROOT/hive/lib/hive-exec-0.12.0-gphd-3.0.0.0.jar:\
$GPHD_ROOT/hive/lib/libfb303-0.9.0.jar:\
$GPHD_ROOT/hive/lib/libthrift-0.9.0.jar:\
/etc/gphd/hive/conf</pre>
</div></div><p align="LEFT">This adds the following to the PHD 2.0.0 Java classpath:</p><ul><li>PXF JARs, and staging dir <em>/usr/lib/gphd/publicstage</em> (see<em> "About the Public Directory" </em>below).</li><li>HBase: hbase-common, hbase-protocol, hbase-client, hbase-thrift, htrace-core, configuration dir</li><li>Zookeeper jar.</li><li>Hive: <span style="line-height: 1.4285;">hive-service, </span> <span style="line-height: 1.4285;">libthrift, </span> <span style="line-height: 1.4285;">hive-metastore, </span> <span style="line-height: 1.4285;">libfb303, </span> <span style="line-height: 1.4285;">hive-common, <span style="line-height: 1.4285;">hive-exec.</span></span></li></ul><h3 id="PXFInstallationandAdministration-EnablingtheRESTService">Enabling the REST Service </h3><p><span style="font-size: medium;"> </span>To enable REST service, edit the<em> /etc/gphd/hadoop/conf/hdfs-site.xml</em> files on all the nodes:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: html/xml; gutter: false" style="font-size:12px;"><property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property></pre>
</div></div><h3 id="PXFInstallationandAdministration-RestartingtheCluster"><a name="RTF37323830363a204865616432" rel="nofollow"></a>Restarting the Cluster </h3><ol><li>From the Admin node, use the Pivotal Hadoop (HD) Manager Command Line Interface to start the cluster.</li><li>Optionally, you can examine the classpath with the following command:</li></ol><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: bash; gutter: false" style="font-size:12px;">massh /tmp/clientnodes verbose "hadoop classpath"</pre>
</div></div><h2 id="PXFInstallationandAdministration-SecurePXF"><span style="color: rgb(0,0,0);">Secure PXF</span></h2><p>PXF can be used on a secure HDFS cluster. Read, write and analyze of PXF tables on HDFS files are enabled. No changes are required to PXF tables that pre-exist from a previous version.</p><h3 id="PXFInstallationandAdministration-Requirements">Requirements</h3><ul><li>Both HDFS and YARN principals are created and are properly configured.</li><li>HDFS uses port 8020.</li><li>HAWQ is configured to work in secure mode properly, according to the instructions in the HAWQ documentation.</li></ul>
<div class="aui-message warning shadowed information-macro">
<p class="title">Notes</p>
<span class="aui-icon icon-warning">Icon</span>
<div class="message-content">
<p>The HDFS Namenode port must be 8020. This is a limitation that will be fixed in the next PXF version.</p><p><span style="color: rgb(0,0,0);">Please refer to the troubleshooting section for common errors related to PXF security, and their meaning.</span></p>
</div>
</div>
<h2 id="PXFInstallationandAdministration-ReadingandWritingDatawithPXF"><span style="color: rgb(0,0,0);line-height: 1.25;font-size: 24.0px;">Reading and Writing Data with PXF</span></h2><p>PXF comes with a number of built-in connectors for reading data that exists inside HDFS files, Hive tables, and HBase tables and for writing data into HDFS files. These built-in connectors use the PXF extensible API. You can also use the extensible API to create your own connectors to any other type of parallel data store or processing engine. See <a href="PXFExternalTableandAPIReference.html">PXF External Table and API Reference</a> for more information about the API.</p><p>This topic contains the following information:</p><ul><li>Accessing HDFS File Data with PXF (Read + Write)</li><li>Accessing HIVE Data with PXF (Read only)</li><li>Accessing HBase Data with PXF (Read only)</li><li>Accessing GemFireXD Data with PXF (Read only)</li></ul><h3 id="PXFInstallationandAdministration-Built-inProfiles">Built-in Profiles</h3><p><span>A profile is a collection of common metadata attributes. Use the convenient and simplified PXF syntax.</span> </p><p>PXF comes with a number of built-in profiles which group together a collection of metadata attributes to achieve a common goal:</p><div class="table-wrap"><table class="confluenceTable"><tbody><tr><th class="confluenceTh">Profile</th><th class="confluenceTh">Description</th><th class="confluenceTh" colspan="1">Fragmenter/Accessor/Resolver</th></tr><tr><td class="confluenceTd">HdfsTextSimple</td><td class="confluenceTd"><p>Read or write delimited single line records from or to plain text files on HDFS.</p></td><td class="confluenceTd" colspan="1"><ul><li><span>com.pivotal.pxf.plugins.hdfs.</span>HdfsDataFragmenter</li><li><span>com.pivotal.pxf.plugins.hdfs.</span>LineBreakAccessor</li><li><span>com.pivotal.pxf.plugins.hdfs.</span>StringPassResolver</li></ul></td></tr><tr><td class="confluenceTd">HdfsTextMulti</td><td class="confluenceTd"><p>Read delimited single or multi line records (with quoted linefeeds) from plain text files on <br/>HDFS.<br/> It is not splittable (non parallel) and therefore reading is slower than reading with <br/>HdfsTextSimple.</p></td><td class="confluenceTd" colspan="1"><ul><li><span>com.pivotal.pxf.plugins.hdfs.</span>HdfsDataFragmenter</li><li><span>com.pivotal.pxf.plugins.hdfs.</span>QuotedLineBreakAccessor</li><li><span>com.pivotal.pxf.plugins.hdfs.</span>StringPassResolver</li></ul></td></tr><tr><td class="confluenceTd">Hive</td><td class="confluenceTd">Use this when connected to Hive.</td><td class="confluenceTd" colspan="1"><ul><li><span>com.pivotal.pxf.plugins.hive.</span>HiveDataFragmenter</li><li><span>com.pivotal.pxf.plugins.hive.</span>HiveAccessor</li><li><span>com.pivotal.pxf.plugins.hive.</span>HiveResolver</li></ul></td></tr><tr><td class="confluenceTd">HBase</td><td class="confluenceTd">Use this when connected to an HBase data store engine.</td><td class="confluenceTd" colspan="1"><ul><li><span>com.pivotal.pxf.plugins.hbase.</span>HBaseDataFragmenter</li><li><span>com.pivotal.pxf.plugins.hbase.</span>HBaseAccessor</li><li><span>com.pivotal.pxf.plugins.hbase.</span>HBaseResolver</li></ul><p> </p></td></tr><tr><td class="confluenceTd">Avro</td><td class="confluenceTd">Reading Avro files (i.e fileName.avro).</td><td class="confluenceTd" 
colspan="1"><ul><li><span>com.pivotal.pxf.plugins.hdfs.</span>HdfsDataFragmenter</li><li><span>com.pivotal.pxf.plugins.hdfs.</span>AvroFileAccessor</li><li><span>com.pivotal.pxf.plugins.hdfs.</span>AvroResolver</li></ul></td></tr><tr><td class="confluenceTd">GemFireXD</td><td class="confluenceTd">Use this when connected to GemFireXD</td><td class="confluenceTd" colspan="1"><ul><li>com.pivotal.pxf.plugins.gemfirexd.GemFireXDFragmenter</li><li>com.pivotal.pxf.plugins.gemfirexd.GemFireXDAccessor</li></ul></td></tr></tbody></table></div><h4 id="PXFInstallationandAdministration-AddingandUpdatingProfiles">Adding and Updating Profiles</h4><p class="diff-block-target diff-block-context"><span style="color: rgb(51,51,51);">Administrators can add new profiles or edit the built-in profiles inside </span><span style="color: rgb(51,51,51);"><em>pxf-profiles.xm</em>l (and apply with the Pivotal Hadoop (HD) Enterprise Command Line Interface). You can use the all the profiles in <em>pxf-profiles.xml</em>.</span></p><p class="diff-block-target diff-block-context">Each profile has a mandatory unique name and an optional description.</p><p class="diff-block-target diff-block-context">In addition, each profile contains a set of plugins that are an extensible set of metadata attributes.</p><h4 class="diff-block-target diff-block-context" id="PXFInstallationandAdministration-CustomProfileExample">Custom Profile Example</h4><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: html/xml; gutter: false" style="font-size:12px;"><profile>
<name>MyCustomProfile</name>
<description>A Custom Profile Example</description>
<plugins>
<fragmenter>package.name.CustomProfileFragmenter</fragmenter>
<accessor>package.name.CustomProfileAccessor</accessor>
<customPlugin1>package.name.MyCustomPluginValue1</customPlugin1>
<customPlugin2>package.name.MyCustomPluginValue2</customPlugin2>
</plugins>
</profile></pre>
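</div></div>
<p>Once a custom profile is defined in <em>pxf-profiles.xml</em>, it can be referenced by name in a table definition. The following sketch is hypothetical and assumes the MyCustomProfile example above; the host, path, and columns are placeholders:</p>
<div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">-- hypothetical table that uses the custom profile defined above
CREATE EXTERNAL TABLE my_custom_data (id int, value text)
LOCATION ('pxf://namenode_host:50070/path/to/data?Profile=MyCustomProfile')
FORMAT 'CUSTOM' (formatter='pxfwritable_import');</pre>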
</div></div><h4 class="diff-block-target diff-block-context" id="PXFInstallationandAdministration-DeprecatedClassnames">Deprecated Classnames</h4><p class="diff-block-context">Before PXF 2.2.0, you could use connector class names without their package names. e.g.<em> HdfsDataFragmenter</em> instead of <em>com.pivotal.pxf.plugins.hdfs.HdfsDataFragmenter</em>. PXF 2.2.0 does <em>not</em> support old names and you will see an error message:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">WARNING: Use of HdfsDataFragmenter is deprecated and it will be removed on the next major version
DETAIL: Please use the appropriate PXF profile for forward compatibility (e.g. profile=HdfsTextSimple)</pre>
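</div></div>
<p>For example, a table definition that previously listed the deprecated class names can instead reference the equivalent profile (see the mapping tables below); the host, path, and columns here are placeholders:</p>
<div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">-- illustrative only: use the profile name instead of the old class names
CREATE EXTERNAL TABLE sales_ext (id int, total float8)
LOCATION ('pxf://namenode_host:50070/sales?Profile=HdfsTextSimple')
FORMAT 'TEXT' (DELIMITER ',');</pre>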
</div></div><p class="diff-block-target diff-block-context"> </p><p class="diff-block-target diff-block-context">Pivotal recommends that you use PXF Profiles as best practice.</p><h4 class="diff-block-target diff-block-context" id="PXFInstallationandAdministration-RecommendedBuilt-inPXFProfiles"><span>Recommended Built-in PXF Profiles </span></h4><div class="table-wrap"><table class="confluenceTable"><tbody><tr><th class="confluenceTh">Old name</th><th class="confluenceTh">PXF Profile</th></tr><tr><td class="confluenceTd" colspan="1">HdfsDataFragmenter, TextFileAccessor, TextResolver</td><td class="confluenceTd" colspan="1">HdfsTextSimple</td></tr><tr><td class="confluenceTd" colspan="1">HdfsDataFragmenter, QuotedLineBreakAccessor, TextResolver</td><td class="confluenceTd" colspan="1">HdfsTextMulti</td></tr><tr><td class="confluenceTd" colspan="1">HdfsDataFragmenter, AvroFileAccessor, AvroResolver</td><td class="confluenceTd" colspan="1">Avro</td></tr><tr><td class="confluenceTd" colspan="1">HdfsDataFragmenter, SequenceFileAccessor, CustomWritable</td><td class="confluenceTd" colspan="1">SequenceWritable</td></tr><tr><td class="confluenceTd" colspan="1">HBaseDataFragmenter, HBaseAccessor, HBaseResolver</td><td class="confluenceTd" colspan="1">HBase</td></tr><tr><td class="confluenceTd" colspan="1">HiveDataFragmenter, HiveAccessor, HiveResolver</td><td class="confluenceTd" colspan="1">Hive</td></tr></tbody></table></div><p class="diff-block-target diff-block-context">Here is a list of class names old and new:</p><div class="table-wrap"><table class="confluenceTable"><tbody><tr><th class="confluenceTh">Old Name</th><th class="confluenceTh">New Name</th></tr><tr><td class="confluenceTd" colspan="1">TextFileAccessor, LineBreakAccessor, LineReaderAccessor</td><td class="confluenceTd" colspan="1">com.pivotal.pxf.plugins.hdfs.LineBreakAccessor</td></tr><tr><td class="confluenceTd" colspan="1">QuotedLineBreakAccessor</td><td class="confluenceTd" colspan="1">com.pivotal.pxf.plugins.hdfs.QuotedLineBreakAccessor</td></tr><tr><td class="confluenceTd" colspan="1">AvroFileAccessor</td><td class="confluenceTd" colspan="1">com.pivotal.pxf.plugins.hdfs.AvroFileAccessor</td></tr><tr><td class="confluenceTd" colspan="1">SequenceFileAccessor</td><td class="confluenceTd" colspan="1">com.pivotal.pxf.plugins.hdfs.SequenceFileAccessor</td></tr><tr><td class="confluenceTd" colspan="1">TextResolver, StringPassResolver</td><td class="confluenceTd" colspan="1">com.pivotal.pxf.plugins.hdfs.StringPassResolver</td></tr><tr><td class="confluenceTd" colspan="1">AvroResolver</td><td class="confluenceTd" colspan="1">com.pivotal.pxf.plugins.hdfs.AvroResolver</td></tr><tr><td class="confluenceTd" colspan="1">WritableResolver</td><td class="confluenceTd" colspan="1">com.pivotal.pxf.plugins.hdfs.WritableResolver</td></tr><tr><td class="confluenceTd" colspan="1">HdfsDataFragmenter</td><td class="confluenceTd" colspan="1">com.pivotal.pxf.plugins.hdfs.HdfsDataFragmenter</td></tr></tbody></table></div><h2 class="diff-block-target diff-block-context" id="PXFInstallationandAdministration-AccessingHDFSFileDatawithPXF">Accessing HDFS File Data with PXF </h2><h3 id="PXFInstallationandAdministration-InstallingthePXFHDFSplugin">Installing the PXF HDFS plugin</h3><p>Install the PXF HDFS plugin jar file on all nodes in the cluster:</p><p> </p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: bash; gutter: false" style="font-size:12px;">sudo rpm -i pxf-hdfs-2.2.0-x.rpm</pre>
</div></div><ul><li class="Code">PXF RPMs reside in the Pivotal ADS/HAWQ stack file. </li><li>The script installs the JAR file at the default location at <em>/usr/lib/gphd/pxf-2.2.0</em>. The S<em>oftlink pxf-hdfs.jar</em> will be created in <em>/usr/lib/gphd/pxf</em></li></ul><p> </p> <div class="aui-message hint shadowed information-macro">
<p class="title">Notes</p>
<span class="aui-icon icon-hint">Icon</span>
<div class="message-content">
<ul><li>Pivotal recommends that you test PXF on HDFS before connecting to Hive or HBase.</li><li>PXF on secure HDFS clusters requires the NameNode to be configured on port 8020.</li></ul>
</div>
</div>
<p>The syntax for accessing an HDFS file is as follows: </p><h3 id="PXFInstallationandAdministration-Syntax">Syntax</h3><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">CREATE [READABLE|WRITABLE] EXTERNAL TABLE <tbl name> (<attr list>)
LOCATION ('pxf://<name node hostname:50070>/<path to file or directory>?Profile=<chosen profile>[&<additional options>=<value>]')
FORMAT '[TEXT | CSV | CUSTOM]' (<formatting properties>)
[ [LOG ERRORS INTO <error_table>] SEGMENT REJECT LIMIT <count> [ROWS | PERCENT] ];
SELECT ... FROM <tbl name>; --to read from hdfs with READABLE table.
INSERT INTO <tbl name> ...; --to write to hdfs with WRITABLE table.</pre>
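</div></div>
<p>For example, a minimal readable table that uses the HdfsTextSimple profile to read a comma-separated file might look like the following (host, path, and columns are placeholders):</p>
<div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">-- illustrative only
CREATE EXTERNAL TABLE example_csv (id int, name text)
LOCATION ('pxf://namenode_host:50070/data/example.csv?Profile=HdfsTextSimple')
FORMAT 'CSV';
SELECT * FROM example_csv;</pre>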
</div></div><p>To read the data in the files, or to write data in the existing format, you need to select the appropriate FORMAT clause and either a Profile or the individual classes.</p><p>This topic describes the following:</p><ul><li>FORMAT clause</li><li>Fragmenter</li><li>Accessor</li><li>Resolver</li></ul> <div class="aui-message hint shadowed information-macro">
<p class="title">Note</p>
<span class="aui-icon icon-hint">Icon</span>
<div class="message-content">
<p>For more details about the API and classes, see the <a href="PXFExternalTableandAPIReference.html">PXF External Table and API Reference</a>.</p>
</div>
</div>
<h3 id="PXFInstallationandAdministration-FORMATclause"><span style="line-height: 1.4285;">FORMAT clause</span></h3><p><span style="line-height: 1.4285;">To read data, use the following formats with any PXF connector:</span></p><ul><li><strong>FORMAT ‘TEXT’</strong>: Use with plain delimited text files on HDFS.</li><li><strong>FORMAT ‘CSV’</strong>: Use with comma-separated value files on HDFS.</li><li><strong>FORMAT ‘CUSTOM’</strong>: Use with other files, such as binary formats. Must always be used with built-in formatter ‘<em>pxfwritable_import</em>’ (for read) or '<em>pxfwritable_export</em>' (for write).</li></ul><h3 id="PXFInstallationandAdministration-Fragmenter">Fragmenter</h3><p>Always use either [<em>HdfsTextSimple | HdfsTextMulti]</em> Profile or an <span>com.pivotal.pxf.plugins.hdfs.</span><em>HdfsDataFragmenter </em>for HDFS file data<em>. </em></p> <div class="aui-message hint shadowed information-macro">
<p class="title">Note</p>
<span class="aui-icon icon-hint">Icon</span>
<div class="message-content">
<p>For read tables, you must include a Profile or a Fragmenter in the table definition.</p>
</div>
</div>
<h3 id="PXFInstallationandAdministration-Accessor">Accessor</h3><p>The choice of an Accessor depends on the HDFS data file type. </p><p><strong>Note:</strong> You must include a Profile or an Accessor in the table definition.</p><div class="table-wrap"><table class="confluenceTable"><tbody><tr><th class="confluenceTh"><div class="tablesorter-header-inner">File Type</div></th><th class="confluenceTh"><div class="tablesorter-header-inner">Accessor</div></th><th class="confluenceTh"><div class="tablesorter-header-inner">FORMAT clause</div></th><th class="confluenceTh" colspan="1"><div class="tablesorter-header-inner">Comments</div></th></tr><tr><td class="confluenceTd">Plain Text delimited</td><td class="confluenceTd"><span>com.pivotal.pxf.plugins.hdfs.</span>LineBreakAccessor</td><td class="confluenceTd">FORMAT 'TEXT' (<format param list>)</td><td class="confluenceTd" colspan="1"> Read + Write</td></tr><tr><td class="confluenceTd" rowspan="2">Plain Text CSV </td><td class="confluenceTd"><p><span>com.pivotal.pxf.plugins.hdfs.</span>LineBreakAccessor</p></td><td class="confluenceTd" rowspan="2">FORMAT 'CSV' (<format param list>) </td><td class="confluenceTd" colspan="1"><p>LineBreakAccessor is parallel and faster.</p><p>Use if each logical data row is a physical data line.</p><p>Read + Write </p></td></tr><tr><td class="confluenceTd" colspan="1"><span>com.pivotal.pxf.plugins.hdfs.</span>QuotedLineBreakAccessor</td><td class="confluenceTd" colspan="1"><p>QuotedLineBreakAccessor is slower and non parallel.</p><p>Use if the data includes embedded (quoted) linefeed <br/>characters.</p><p>Read Only </p></td></tr><tr><td class="confluenceTd">SequenceFile</td><td class="confluenceTd"><span>com.pivotal.pxf.plugins.hdfs.</span>SequenceFileAccessor</td><td class="confluenceTd">FORMAT 'CUSTOM' (formatter='pxfwritable_import')</td><td class="confluenceTd" colspan="1"> Read + Write (use formatter='pxfwritable_export' for write)</td></tr><tr><td class="confluenceTd" colspan="1">AvroFile</td><td class="confluenceTd" colspan="1"><span>com.pivotal.pxf.plugins.hdfs.</span>AvroFileAccessor</td><td class="confluenceTd" colspan="1">FORMAT 'CUSTOM' (formatter='pxfwritable_import')</td><td class="confluenceTd" colspan="1"> Read Only</td></tr></tbody></table></div><h3 id="PXFInstallationandAdministration-Resolver">Resolver</h3><p>Choose the Resolver format if data records are serialized in the HDFS file. </p><p><strong>Note</strong>: You must include a Profile or a Resolver in the table definition.</p><div class="table-wrap"><table class="confluenceTable"><tbody><tr><th class="confluenceTh"><div class="tablesorter-header-inner">Record Serialization</div></th><th class="confluenceTh"><div class="tablesorter-header-inner">Resolver</div></th><th class="confluenceTh" colspan="1"><div class="tablesorter-header-inner">Comments</div></th></tr><tr><td class="confluenceTd">Avro</td><td class="confluenceTd"><span>com.pivotal.pxf.plugins.hdfs.</span>AvroResolver</td><td class="confluenceTd" colspan="1"><ul><li>Avro files include the record schema, Avro serialization can be used in other file types (e.g, Sequence File). 
</li><li>For Avro serialized records outside an Avro file, include a schema file name (.avsc)<span> in the URL under <br/>the optional </span><em>Schema-Data</em> option.</li><li><span>The schema file name must exist in the public stage directory.</span></li><li>Deserialize Only (Read)<span> </span></li></ul></td></tr><tr><td class="confluenceTd">Java Writable</td><td class="confluenceTd"><span>com.pivotal.pxf.plugins.hdfs.</span>WritableResolver</td><td class="confluenceTd" colspan="1"><ul><li>Include the name of the Java class that uses Writable serialization <span>in the URL under the optional </span><em>Schema-Data</em> option.</li><li><span>The class file must exist in the public stage directory (or in Hadoop's class path).</span></li><li>Deserialize and Serialize (Read + Write)<span> </span></li><li><span><span>See </span></span><span style="line-height: 1.4285;">Customized Writable Schema File Guidelines</span></li></ul></td></tr><tr><td class="confluenceTd">None (plain text)</td><td class="confluenceTd"><span>com.pivotal.pxf.plugins.hdfs.</span>StringPassResolver</td><td class="confluenceTd" colspan="1"><ul><li>Does not serialize plain text records. The database parses plain records. Passes records as they are.</li><li><span>Deserialize and Serialize (Read + Write)</span></li></ul></td></tr></tbody></table></div><h3 id="PXFInstallationandAdministration-AdditionalOptions">Additional Options</h3><div class="table-wrap"><table class="confluenceTable"><tbody><tr><th class="confluenceTh"><div class="tablesorter-header-inner">Option Name</div></th><th class="confluenceTh"><div class="tablesorter-header-inner">Description</div></th></tr><tr><td class="confluenceTd"><span style="color: rgb(0,0,0);">COMPRESSION_CODEC</span></td><td class="confluenceTd"><ul><li>Useful for WRITABLE PXF tables.</li><li>Specifies the compression codec class name for compressing the written data. The class must implement the org.apache.hadoop.io.compress.CompressionCodec interface.</li><li>Some valid values are org.apache.hadoop.io.compress.DefaultCodec, org.apache.hadoop.io.compress.GzipCodec, and org.apache.hadoop.io.compress.BZip2Codec.</li><li>Note: org.apache.hadoop.io.compress.BZip2Codec runs in a single thread and can be slow.</li><li>This option has no default value.</li><li>When the option is not defined, no compression is performed.</li></ul></td></tr><tr><td class="confluenceTd" colspan="1">COMPRESSION_TYPE</td><td class="confluenceTd" colspan="1"><ul><li>Useful for WRITABLE PXF tables with SequenceFileAccessor.</li><li>Ignored when COMPRESSION_CODEC is not defined.</li><li>Specifies the compression type for the sequence file.</li><li>Valid options are: <ul><li>RECORD - only the value part of each row is compressed.</li><li>BLOCK - both keys and values are collected in 'blocks' separately and compressed.</li></ul></li><li>Default value: RECORD.</li></ul></td></tr><tr><td class="confluenceTd">SCHEMA-DATA</td><td class="confluenceTd"><p>The data schema file used to create and read the HDFS file. <span>For example, you could create an avsc file (for Avro) or a Java class file (for Writable serialization). Check that the file exists in the public directory (see About the Public Directory).</span></p><p><span style="color: rgb(0,0,0);">This option has no default value.</span></p></td></tr><tr><td class="confluenceTd" colspan="1">THREAD-SAFE</td><td class="confluenceTd" colspan="1"><p>Determines whether the table query can run in multi-threaded mode. When set to FALSE, requests are handled in a single thread.</p><p>Set this to FALSE when a plugin or other element that is not thread-safe is used (for example, a compression codec).</p><p>Allowed values: TRUE, FALSE. The default value is TRUE - requests can run in multi-threaded mode.</p></td></tr><tr><td class="confluenceTd" colspan="1">&lt;custom&gt;</td><td class="confluenceTd" colspan="1">Any other option added to the pxf URI string is accepted and passed, along with its value, to the Fragmenter, <br/>Accessor, Analyzer, and <span>Resolver implementations.</span></td></tr></tbody></table></div><h3 id="PXFInstallationandAdministration-AccessingdataonaHighAvailabilityHDFScluster">Accessing data on a High Availability HDFS cluster</h3><p>To access data on a High Availability HDFS cluster, you need to change the authority in the URI in the LOCATION clause.<br/>Use <<strong>HA nameservice</strong>> instead of <<strong>name node host:50070</strong>>.</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">CREATE [READABLE|WRITABLE] EXTERNAL TABLE <tbl name> (<attr list>)
LOCATION ('pxf://<HA nameservice>/<path to file or directory>?Profile=<chosen profile>[&<additional options>=<value>]')
FORMAT '[TEXT | CSV | CUSTOM]' (<formatting properties>);</pre>
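</div></div>
<p>For example, assuming an HA nameservice named <em>nameservice1</em> (the path and columns are placeholders):</p>
<div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">-- illustrative only: the HA nameservice replaces the name node host and port
CREATE EXTERNAL TABLE ha_example (id int, name text)
LOCATION ('pxf://nameservice1/data/example.csv?Profile=HdfsTextSimple')
FORMAT 'CSV';</pre>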
</div></div><p>You can access the data using SQL queries.</p><p> </p><p><span style="color: rgb(0,102,0);line-height: 1.5;font-size: 20.0px;">About the Public Directory</span></p><p align="LEFT">PXF provides a space to store your customized serializers and schema files on the filesystem. You must add schema files on all the datanodes and restart the cluster. The RPM creates the directory at the default location: <em>/usr/lib/gphd/publicstage</em>.</p><p align="LEFT">Ensure that all HDFS users have read permissions to HDFS services and limit write permissions to specific users.</p><h3 id="PXFInstallationandAdministration-Recordkeyinkey-valuefileformats">Record key in key-value file formats</h3><p>For sequence files and other file formats that store rows in a key-value format, the key value can be accessed through HAWQ using the reserved keyword '<em>recordkey</em>' as a field name.</p><p>The field type must correspond to the key type, just as the other field types must match the HDFS data. </p><p>WritableResolver supports read and write of the recordkey, which can be of the following Hadoop Writable types: <br/><span style="color: rgb(0,0,0);">BooleanWritable, ByteWritable, IntWritable, DoubleWritable, FloatWritable, LongWritable, Text.</span></p><p><span style="color: rgb(0,0,0);">If the <em>recordkey</em> field is not defined, the key is ignored on read, and a default value (the segment ID as a LongWritable) is written on write.</span></p><h3 id="PXFInstallationandAdministration-Example"><span style="color: rgb(0,0,0);">Example</span></h3><p><span style="color: rgb(0,0,0);">Let's say we have a data schema Babies.class containing three fields: (name text, birthday text, weight float). </span></p><p><span style="color: rgb(0,0,0);">An external table must include these three fields, and can either include or ignore the recordkey.</span></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">-- writable table with recordkey
CREATE WRITABLE EXTERNAL TABLE babies_registry (recordkey int, name text, birthday text, weight float)
LOCATION ('pxf://namenode_host:50070/babies_1940s?ACCESSOR=com.pivotal.pxf.plugins.hdfs.SequenceFileAccessor&RESOLVER=com.pivotal.pxf.plugins.hdfs.WritableResolver&DATA-SCHEMA=Babies')
FORMAT 'CUSTOM' (formatter='pxfwritable_export');
INSERT INTO babies_registry VALUES (123456, 'James Paul McCartney', 'June 18, 1942', 3.800);
-- writable table without recordkey
CREATE WRITABLE EXTERNAL TABLE babies_registry2 (name text, birthday text, weight float)
LOCATION ('pxf://namenode_host:50070/babies_1940s?ACCESSOR=com.pivotal.pxf.plugins.hdfs.SequenceFileAccessor&RESOLVER=com.pivotal.pxf.plugins.hdfs.WritableResolver&DATA-SCHEMA=Babies')
FORMAT 'CUSTOM' (formatter='pxfwritable_export');
INSERT INTO babies_registry VALUES ("Richard Starkey", "July 7, 1940", 4.0); -- this record's key will have some default value</pre>
</div></div><p>The same goes for reading data from an existing file of a key-value format, e.g. a Sequence file.</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">-- readable table with recordkey
CREATE EXTERNAL TABLE babies_1940 (recordkey int, name text, birthday text, weight float)
LOCATION ('pxf://namenode_host:50070/babies_1940s?ACCESSOR=com.pivotal.pxf.plugins.hdfs.SequenceFileAccessor&RESOLVER=com.pivotal.pxf.plugins.hdfs.WritableResolver&DATA-SCHEMA=Babies')
FORMAT 'CUSTOM' (formatter='pxfwritable_import');
SELECT * FROM babies_1940; -- retrieves each record's key
-- readable table without recordkey
CREATE EXTERNAL TABLE babies_1940_2 (name text, birthday text, weight float)
LOCATION ('pxf://namenode_host:50070/babies_1940s?ACCESSOR=com.pivotal.pxf.plugins.hdfs.SequenceFileAccessor&RESOLVER=com.pivotal.pxf.plugins.hdfs.WritableResolver&DATA-SCHEMA=Babies')
FORMAT 'CUSTOM' (formatter='pxfwritable_import');
SELECT * FROM babies_1940_2; -- ignores the records' key</pre>
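</div></div>
<p>The <em>Babies</em> schema referenced in the examples above is a Java class that uses Writable serialization. The following is a minimal, hypothetical sketch of what such a class might look like; the field names mirror the example, and the class must follow the guidelines in the next section:</p>
<div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">// hypothetical Writable schema class for the examples above;
// fields must be public and are mapped in the order they are declared
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class Babies implements Writable {
    public String name = "";
    public String birthday = "";
    public float weight;

    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeUTF(birthday);
        out.writeFloat(weight);
    }

    public void readFields(DataInput in) throws IOException {
        name = in.readUTF();
        birthday = in.readUTF();
        weight = in.readFloat();
    }
}</pre>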
</div></div><h2 id="PXFInstallationandAdministration-CustomizedWritableSchemaFileGuidelines">Customized Writable Schema File Guidelines</h2><p><span style="color: rgb(0,0,0);">When using a WritableResolver, a schema file needs to be defined. The file needs to be a java class file and must be on the class path of PXF.</span></p><p><span style="color: rgb(0,0,0);line-height: 1.4285;">The class file must follow the following requirements:</span></p><ol><li><span style="color: rgb(0,0,0);">Must implement org.apache.hadoop.io.Writable interface. </span></li><li><span style="color: rgb(0,0,0);line-height: 1.4285;">WritableResolver uses reflection to recreate the schema and populate its fields (for both read and write). Then it uses the Writable interface functions to read/write.<br/></span><span style="color: rgb(0,0,0);line-height: 1.4285;">Therefore fields must be public to enable access to them. Private fields will be ignored.</span></li><li><span style="color: rgb(0,0,0);line-height: 1.4285;">Fields are accessed and populated by the order in which they are declared in the class file.</span></li><li><span style="color: rgb(0,0,0);line-height: 1.4285;">Supported field types:</span><br/><span style="color: rgb(0,0,0);line-height: 1.4285;">String, int, double, float, long, short, boolean, byte array.<br/></span><span style="color: rgb(0,0,0);line-height: 1.4285;">Array of any of the above types is supported, but the constructor must define the array size so the reflection will work.</span></li></ol><h2 id="PXFInstallationandAdministration-AccessingHiveDatawithPXF">Accessing Hive Data with PXF</h2><h3 id="PXFInstallationandAdministration-InstallingthePXFHIVEplugin">Installing the PXF HIVE plugin</h3><p>Install the PXF HIVE plugin on all nodes in the cluster:<span style="line-height: 1.4285;font-size: 14.0px;background-color: transparent;"> </span></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: bash; gutter: false" style="font-size:12px;">sudo rpm -i pxf-hive-2.2.0-x.rpm</pre>
</div></div><ul><li class="Code">PXF RPMs reside in the Pivotal ADS/HAWQ stack file. </li><li>The script installs the JAR file at the default location at <em>/usr/lib/gphd/pxf-2.2.0</em>. The S<em>oftlink pxf-hive.jar</em> will be created in <em>/usr/lib/gphd/pxf</em><br/><em><br/></em></li></ul> <div class="aui-message hint shadowed information-macro">
<p class="title">PXF HIVE Prerequisites</p>
<span class="aui-icon icon-hint">Icon</span>
<div class="message-content">
<p>Check the following before adding PXF support on Hive:</p><ul><li>The PXF HDFS plugin is installed on the cluster nodes.</li><li>The Hive Metastore service is running on a machine in your cluster.</li><li>The <em>hive.metastore.uris</em> property is set in <em>hive-site.xml</em> on the Namenode.</li><li>The Hive JAR files are installed on the cluster nodes.</li></ul><p>See <em>Setting up the Java Classpath</em>.</p>
</div>
</div>
<h3 id="PXFInstallationandAdministration-Syntax.1">Syntax</h3><p>Hive tables are always defined in a specific way in PXF, regardless of the underlying file storage format. The PXF Hive plugins automatically detect source tables:</p><ul><li>Text based</li><li>SequenceFile</li><li>RCFile</li><li>ORCFile</li></ul><p>The source table can also be a combination of these types. The PXF Hive plugin uses this information to query the data in runtime. The following PXF table definition is valid for any file storage type.</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">CREATE EXTERNAL TABLE hivetest(id int, newid int)
LOCATION ('pxf://<NN host>:50070/<hive table name>?PROFILE=Hive')
FORMAT 'custom' (formatter='pxfwritable_import');
SELECT * FROM hivetest;</pre>
</div></div><p><span style="line-height: 1.4285;"> </span> <strong>Note</strong> <span>: 50070 as noted in the example above is the REST server port on the HDFS NameNode. If a different port is assigned in your installation, use that port.</span></p><h3 id="PXFInstallationandAdministration-HiveCommandLine">Hive Command Line</h3><p><span style="line-height: 1.4285;">To start the Hive command line and work directly on a Hive table:</span></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: powershell; gutter: false" style="font-size:12px;">/>${HIVE_HOME}/bin/hive</pre>
</div></div><p>Here's an example of how to create and load data into a sample Hive table from an existing file.</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">Hive> CREATE TABLE test (name string, type string, supplier_key int, full_price double) row format delimited fields terminated by ',';
Hive> LOAD DATA local inpath '/local/path/data.txt' into table test; </pre>
</div></div><h3 id="PXFInstallationandAdministration-MappingHiveCollectionTypes"><span style="line-height: 1.5625;">Mapping Hive Collection Types</span></h3><p><span style="line-height: 1.5625;">PXF supports Hive data types that are not primitive types. For example</span> <span style="line-height: 1.4285;">:</span></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">CREATE TABLE sales_collections (
item STRING,
price FLOAT,
properties ARRAY<STRING>,
hash MAP<STRING,FLOAT>,
delivery_address STRUCT<street:STRING, city:STRING, state:STRING, zip:INT>
)
ROW FORMAT DELIMITED FIELDS
TERMINATED BY '\001' COLLECTION ITEMS TERMINATED BY '\002' MAP KEYS TERMINATED BY '\003' LINES TERMINATED BY '\n' STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH '/local/path/<some data file>' INTO TABLE sales_collections;</pre>
</div></div><p><span style="color: rgb(51,51,51);line-height: 1.4285;font-size: 14.0px;">To query a Hive table schema similar to the one in the example you need to define the PXF external table with attributes corresponding to members in the Hive table array and map fields. For example:</span></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">CREATE EXTERNAL TABLE gp_sales_collections(
item_name TEXT,
item_price REAL,
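-- attributes mapped to the members of the properties array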
property_type TEXT,
property_color TEXT,
property_material TEXT,
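-- attributes mapped to the hash map's key/value pairs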
hash_key1 TEXT,
hash_val1 REAL,
hash_key2 TEXT,
hash_val2 REAL,
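-- attributes mapped to the members of the delivery_address struct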
delivery_street TEXT,
delivery_city TEXT,
delivery_state TEXT,
delivery_zip INTEGER
)
LOCATION ('pxf://<namenode_host>:50070/sales_collections?PROFILE=Hive')
FORMAT 'custom' (FORMATTER='pxfwritable_import');</pre>
</div></div><h3 id="PXFInstallationandAdministration-PartitionFiltering">Partition Filtering</h3><p align="LEFT">The PXF Hive plugin uses the Hive partitioning feature and directory structure. This enables partition exclusion on HDFS files that contain the Hive table. To use the Partition Filtering feature to reduce network traffic and I/O, run a PXF query using a WHERE clause that refers to a specific partition in the partitioned Hive table.</p><p align="LEFT">To take advantage of PXF Partition filtering push down, name the partition fields in the external table. These names must be the same as those stored in the Hive table. Otherwise, PXF ignores Partition filtering and the filtering is performed on the HAWQ side, impacting performance.</p> <div class="aui-message hint shadowed information-macro">
<p class="title">NOTE</p>
<span class="aui-icon icon-hint">Icon</span>
<div class="message-content">
<p align="LEFT">The Hive plugin only filters on partition columns but not on other table attributes.</p>
</div>
</div>
<h4 id="PXFInstallationandAdministration-Example.1">Example</h4><p align="LEFT">Create a Hive tabl<em>e sales_part</em> with 2 partition columns - <em>delivery_state</em> and <em>delivery_city:</em></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">CREATE TABLE sales_part (name STRING, type STRING, supplier_key INT, price DOUBLE)
PARTITIONED BY (delivery_state STRING, delivery_city STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
</pre>
</div></div><p>Load data into this Hive table and add some partitions:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: plain; gutter: false" style="font-size:12px;">LOAD DATA LOCAL INPATH '/local/path/data1.txt' INTO TABLE sales_part PARTITION(delivery_state = 'CALIFORNIA', delivery_city = 'San Francisco');
LOAD DATA LOCAL INPATH '/local/path/data2.txt' INTO TABLE sales_part PARTITION(delivery_state = 'CALIFORNIA', delivery_city = 'Sacramento');
LOAD DATA LOCAL INPATH '/local/path/data3.txt' INTO TABLE sales_part PARTITION(delivery_state = 'NEVADA' , delivery_city = 'Reno');
LOAD DATA LOCAL INPATH '/local/path/data4.txt' INTO TABLE sales_part PARTITION(delivery_state = 'NEVADA' , delivery_city = 'Las Vegas');</pre>
</div></div><p align="LEFT">The Hive storage directory should appears as follows:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: plain; gutter: false" style="font-size:12px;">/hive/warehouse/sales_part/delivery_state=CALIFORNIA/delivery_city=’San Francisco’/data1.txt
/hive/warehouse/sales_part/delivery_state=CALIFORNIA/delivery_city=Sacramento/data2.txt
/hive/warehouse/sales_part/delivery_state=NEVADA/delivery_city=Reno/data3.txt
/hive/warehouse/sales_part/delivery_state=NEVADA/delivery_city='Las Vegas'/data4.txt</pre>
</div></div><p>To define a PXF table that reads this Hive table and takes advantage of partition filter push-down, define the fields corresponding to the Hive partition fields at the end of the attribute list. In HiveQL, issuing a <em>SELECT *</em> statement on a partitioned table shows the partition fields at the end of the record.</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">CREATE EXTERNAL TABLE pxf_sales_part(
item_name TEXT,
item_type TEXT,
supplier_key INTEGER,
item_price DOUBLE PRECISION,
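-- partition fields appear last, matching the Hive partition column names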
delivery_state TEXT,
delivery_city TEXT
)
LOCATION ('pxf://namenode_host:50070/sales_part?PROFILE=Hive')
FORMAT 'custom' (FORMATTER='pxfwritable_import');
SELECT * FROM pxf_sales_part;</pre>
</div></div><h4 id="PXFInstallationandAdministration-Example.2">Example </h4><p><span style="line-height: 1.4285;">In the following example the HAWQ query filters the <em>delivery_city</em> partition <em>Sacramento</em>. The filter on </span> <em style="line-height: 1.4285;">item_name</em> <span style="line-height: 1.4285;"> is not pushed down since it is not a partition column. It is </span> <span style="line-height: 1.4285;">performed on the HAWQ side after all the data on <em>Sacramento</em> is transferred for </span> <span style="line-height: 1.4285;">processing.</span></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">SELECT * FROM pxf_sales_part WHERE delivery_city = 'Sacramento' AND item_name = 'shirt';</pre>
</div></div><h4 id="PXFInstallationandAdministration-Example.3"><span style="line-height: 1.4285;">Example</span></h4><p><span style="line-height: 1.4285;">The following HAWQ query reads all the data under <em>delivery_city</em> partition <em>CALIFORNIA</em>, regardless of the city partiti</span><span style="line-height: 1.4285;">on.</span></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">SELECT * FROM pxf_sales_part WHERE delivery_state = 'CALIFORNIA'</pre>
</div></div><h2 id="PXFInstallationandAdministration-AccessingHBaseDatawithPXF">Accessing HBase Data with PXF</h2><h3 id="PXFInstallationandAdministration-InstallingthePXFHBaseplugin">Installing the PXF HBase plugin</h3><p>Install the PXF HBase plugin on all nodes in the cluster:<span> </span></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: bash; gutter: false" style="font-size:12px;">sudo rpm -i pxf-hbase-2.2.0-x.rpm</pre>
</div></div><ul><li class="Code">PXF RPMs reside in the Pivotal ADS/HAWQ stack file. </li><li>The script installs the JAR file at the default location at <em>/usr/lib/gphd/pxf-2.2.0</em>. The S<em>oftlink pxf-hbase.jar</em> will be created in <em>/usr/lib/gphd/pxf</em><br/><em><br/></em></li></ul> <div class="aui-message hint shadowed information-macro">
<p class="title">PXF HBase Prerequisites</p>
<span class="aui-icon icon-hint">Icon</span>
<div class="message-content">
<p>Before using the PXF HBase plugin, verify the following:</p><ul><li>The PXF HDFS plugin is installed on the cluster nodes.</li><li>You have set up <em>hadoop-env.sh</em> on the Namenode.</li><li>All Datanodes have <em>hbase.jar</em>, <em>zookeeper.jar</em>, and the HBase <em>conf</em> directory set in the <em>HADOOP_CLASSPATH</em>.</li></ul><p>See <em>Installing PXF</em> for more information.</p>
</div>
</div>
<h3 id="PXFInstallationandAdministration-Syntax.2">Syntax</h3><p><span style="color: rgb(51,51,51);line-height: 1.4285;font-size: 14.0px;"> </span> <span style="color: rgb(51,51,51);line-height: 1.4285;font-size: 14.0px;">To query an <em>HBase</em> table use the following syntax:</span></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">CREATE EXTERNAL TABLE <pxf tblname> (<col list - see details below>)
LOCATION ('pxf://<NN REST host>:<NN REST port>/<HBase table name>?PROFILE=HBase')
FORMAT 'CUSTOM' (FORMATTER='pxfwritable_import');
SELECT * FROM <pxf tblname>;</pre>
</div></div><h2 id="PXFInstallationandAdministration-ColumnMapping">Column Mapping</h2><p>Most HAWQ external tables (PXF or others) require that the HAWQ table attributes match the source data record layout, and include all the available attributes. However, use the PXF HBase plugin to specify the subset of HBase qualifiers to define the HAWQ PXF table. To set up a clear mapping between each attribute in the PXF table and a specific qualifier in the HBase table, you can use either:</p><ul><li>Directmapping</li><li>Indirect mapping</li></ul><p>In addition, the HBase row key is handled in a special way.</p><h2 id="PXFInstallationandAdministration-RowKey">Row Key</h2><p>You can use the HBase table row key in several ways. For example, you can see them using query results, or you can run a WHERE clause filter on a range of row key values. To use the row key in the HAWQ query, define the HAWQ table with the reserved PXF attribute <em>recordkey.</em> This attribute name tells PXF to return the record key in any key-value based system and in HBase.</p> <div class="aui-message hint shadowed information-macro">
<p class="title">NOTE</p>
<span class="aui-icon icon-hint">Icon</span>
<div class="message-content">
<p>Because HBase is byte-based rather than character-based, Pivotal recommends that you define the <em>recordkey</em> as type <em>bytea</em>. This can improve the ability to filter data and increase performance.</p>
</div>
</div>
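<p>For example, once the external table is defined with the reserved <em>recordkey</em> attribute (as in the snippet below), a WHERE clause can restrict the query to a range of row key values. This is a minimal sketch; the table name and key values are placeholders:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">-- hypothetical query: filter on a range of HBase row keys
SELECT * FROM <tname> WHERE recordkey > '00100' AND recordkey < '00200';</pre>
</div></div>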
<div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">CREATE EXTERNAL TABLE <tname> (recordkey bytea, ... ) LOCATION ('pxf:// ...')</pre>
</div></div><h3 id="PXFInstallationandAdministration-DirectMapping">Direct Mapping</h3><p>Use Direct Mapping to map HAWQ table attributes to HBase qualifiers. You can specify the HBase qualifier names of interest, with column family names included, as quoted values. </p><p>For example, you have defined an <span>HBase table called <em>hbase_sales</em> with multiple column families and many qualifiers. To see the following in the resulting attribute section of the CREATE EXTERNAL TABLE:</span></p><ul><li><span><em>rowkey</em></span></li><li><span>qualifier <em>saleid</em> in the column family <em>cf1</em></span></li><li><span>qualifier<em> comments</em> in the column family<em> cf8</em> </span></li></ul><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">CREATE EXTERNAL TABLE hbase_sales (
recordkey bytea,
"cf1:saleid" int,
"cf8:comments" varchar
) ...</pre>
</div></div><p>The PXF HBase plugin uses these attribute names as-is and returns the values of these HBase qualifiers.</p><h3 id="PXFInstallationandAdministration-IndirectMapping(viaLookupTable)">Indirect Mapping (via Lookup Table)</h3><p>The direct mapping method is fast and intuitive, but indirect mapping helps reconcile HBase qualifier names with HAWQ limitations:</p><ul><li>HBase qualifier names can be longer than 32 characters; HAWQ has a 32-character limit on attribute name size.</li><li>HBase qualifier names can be binary or non-printable; HAWQ attribute names are character based.</li></ul><p>In either case, indirect mapping uses a lookup table on HBase. You create the lookup table to store all necessary lookup information, and it serves as a template for any future queries. The name of the lookup table must be <em>pxflookup</em> and it must include the column family named <em>mapping</em>.</p><p>Using the sales example from Direct Mapping, the <em>rowkey</em> holds the HBase table name and the <em>mapping</em> column family holds the actual attribute mapping in the key-value form <em><hawq attr name>=<hbase cf:qualifier></em>.</p><h4 id="PXFInstallationandAdministration-Example.4">Example</h4><div class="table-wrap"><table class="confluenceTable"><tbody><tr><th class="confluenceTh">(row key)</th><th class="confluenceTh">mapping</th></tr><tr><td class="confluenceTd">sales</td><td class="confluenceTd">id=cf1:saleid</td></tr><tr><td class="confluenceTd">sales</td><td class="confluenceTd">cmts=cf8:comments</td></tr></tbody></table></div><p><strong>Note</strong>: The mapping assigns a new name to each qualifier. You can use these names in your HAWQ table definition:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">CREATE EXTERNAL TABLE hbase_sales (
recordkey bytea,
id int,
cmts varchar
) ...</pre>
</div></div><p>PXF automatically matches HAWQ to HBase column names when a <em>pxflookup </em>table exists in HBase.</p><h2 id="PXFInstallationandAdministration-AccessingGemFireDatawithPXF">Accessing GemFire Data with PXF</h2> <div class="aui-message hint shadowed information-macro">
<p class="title">NOTE</p>
<span class="aui-icon icon-hint">Icon</span>
<div class="message-content">
<p>Before using the PXF GemFire plugin, verify the following:</p><ul><li>You have installed the <em>gfxd</em> rpm on the Namenode and on the Datanodes.</li><li>The Namenode and all Datanodes have <em>sqlfire.jar</em> set in the <em>HADOOP_CLASSPATH</em>.</li></ul><p>See <em>Installing PXF</em> for more information.</p>
</div>
</div>
<h3 id="PXFInstallationandAdministration-Syntax.3">Syntax</h3><p><span style="color: rgb(51,51,51);line-height: 1.4285;font-size: 14.0px;"> </span> <span style="color: rgb(51,51,51);line-height: 1.4285;font-size: 14.0px;">To query an <em>GemFire</em> table use the following syntax:</span></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: sql; gutter: false" style="font-size:12px;">CREATE EXTERNAL TABLE <pxf tblname> (<col list>)
LOCATION ('pxf://<NN REST host>:<NN REST port>/<GemFire table name>?PROFILE=GemFireXD&<GemFire specific connector options>')
FORMAT 'CUSTOM' (FORMATTER='pxfwritable_import');
SELECT * FROM <pxf tblname>;</pre>
</div></div><p>The GemFireXD connector has quite a few connector options that can be used in the LOCATION URI; they are documented in the GemFireXD documentation. It is highly recommended to review them carefully in order to get the expected behavior when querying GemFireXD data through PXF.</p><h2 id="PXFInstallationandAdministration-Troubleshooting">Troubleshooting</h2><p>The following table describes some common errors while using PXF:</p><p><strong>Table: PXF Errors and Explanation</strong></p><div class="table-wrap"><table class="confluenceTable"><tbody><tr><td class="confluenceTd"><p align="LEFT"><strong>Error</strong></p></td><td class="confluenceTd"><p align="LEFT"><strong>Common Explanation</strong></p></td></tr><tr><td class="confluenceTd"><p align="LEFT"><em>ERROR: invalid URI pxf://localhost:50070/demo/file1: missing options section</em></p></td><td class="confluenceTd"><p align="LEFT"><em>LOCATION</em> does not include options after the file name: <em><path>?<key>=<value>&<key>=<value>...</em></p></td></tr><tr><td class="confluenceTd"><p align="LEFT"><em>ERROR: protocol "pxf" does not exist</em></p></td><td class="confluenceTd"><p align="LEFT">HAWQ is not compiled with the PXF protocol. It requires the GPSQL version of HAWQ.</p></td></tr><tr><td class="confluenceTd"><p align="LEFT"><em>ERROR: remote component error: 0</em></p><p align="LEFT">DETAIL: There is no pxf servlet listening on the host and port specified in the external table url.</p></td><td class="confluenceTd"><p align="LEFT">Wrong server or port, or the service is not started.</p></td></tr><tr><td class="confluenceTd"><p align="LEFT"><em>ERROR: Missing FRAGMENTER option in the pxf uri: pxf://localhost:50070/demo/file1?a=a</em></p></td><td class="confluenceTd"><p align="LEFT">No <em>FRAGMENTER</em> option was specified in <em>LOCATION</em>.</p></td></tr><tr><td class="confluenceTd"><p align="LEFT"><em>ERROR: remote component error: 500</em></p><p align="LEFT">DETAIL: Problem accessing <em>/gpdb/v<X>/Fragmenter/getFragments</em>. Reason: <em>org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://0.0.0.0:8020/demo/file1</em></p></td><td class="confluenceTd"><p align="LEFT">File or pattern given in <em>LOCATION</em> doesn't exist on the specified path.</p></td></tr><tr><td class="confluenceTd"><p align="LEFT"><em>ERROR: remote component error: 500</em></p><p align="LEFT">DETAIL: Problem accessing <em>/gpdb/v<X>/Fragmenter/getFragments</em>. Reason: <em>org.apache.hadoop.mapred.InvalidInputException: Input Pattern hdfs://0.0.0.0:8020/demo/file*</em> matches 0 files</p></td><td class="confluenceTd"><p align="LEFT">File or pattern given in <em>LOCATION</em> doesn't exist on the specified path.</p></td></tr><tr><td class="confluenceTd" colspan="1"><em>ERROR: remote component error: 500 PXF not correctly installed in CLASSPATH</em></td><td class="confluenceTd" colspan="1">Cannot find the PXF JAR.</td></tr><tr><td class="confluenceTd"><p align="LEFT"><em>ERROR: GPHD component not found</em></p></td><td class="confluenceTd"><p align="LEFT">Either the required data node does not exist, the REST service on the data node is not started, or the PXF JAR cannot be found on the data node.</p></td></tr><tr><td class="confluenceTd" colspan="1"><pre>ERROR: remote component error (500) </pre><pre>Problem accessing /gpdb/v<X>/Fragmenter/getFragments. Reason: java.io.IOException: Can't get Master Kerberos principal for use as renewer</pre></td><td class="confluenceTd" colspan="1">Secure PXF: YARN isn't properly configured for secure (Kerberized) HDFS installs.</td></tr><tr><td class="confluenceTd" colspan="1"><pre>ERROR: fail to get filesystem credential for uri hdfs://<namenode>:8020/</pre></td><td class="confluenceTd" colspan="1">Secure PXF: Wrong HDFS host, or the port is not 8020 (see the limitations of Secure PXF).</td></tr><tr><td class="confluenceTd" colspan="1"><pre>ERROR: remote component error (413) from <x>: HTTP status code is 413 but HTTP response string is empty</pre></td><td class="confluenceTd" colspan="1">The number of attributes in the PXF table and their name sizes are too large for Jetty to accommodate in its request buffer. The solution is to increase the value of the http.header.size parameter in core-site.xml on all nodes and then restart hadoop:<br/><property><br/><name>http.header.size</name><br/><value><some larger value></value><br/></property></td></tr><tr><td class="confluenceTd" colspan="2"><p align="LEFT"><strong>HBase Specific Errors</strong></p></td></tr><tr><td class="confluenceTd"><p align="LEFT"><em>ERROR: remote component error: 500</em></p><p align="LEFT">DETAIL: Problem accessing <em>/gpdb/v<X>/Fragmenter/getFragments</em>. Reason: <em>org.apache.hadoop.hbase.client.NoServerForRegionException: Unable to find region for t1,,99999999999999</em> after 10 tries.</p></td><td class="confluenceTd"><p align="LEFT">HBase service is down, probably HRegionServer.</p></td></tr><tr><td class="confluenceTd"><p align="LEFT"><em>ERROR: remote component error: 500</em></p><p align="LEFT">DETAIL: Problem accessing <em>/gpdb/v<X>/Fragmenter/getFragments</em>. Reason: <em>org.apache.hadoop.hbase.TableNotFoundException:</em> nosuch</p></td><td class="confluenceTd"><p align="LEFT">HBase cannot find the requested table.</p></td></tr><tr><td class="confluenceTd"><p align="LEFT"><em>ERROR: remote component error: 500 (/HTTP/1.1 500 Internal Server Error</em></p></td><td class="confluenceTd"><p align="LEFT">PXF cannot find a required JAR file, probably HBase's.</p></td></tr><tr><td class="confluenceTd"><p align="LEFT"><em>ERROR: remote component error: 500 (/HTTP/1.1 500 org/apache/zookeeper/KeeperException</em></p></td><td class="confluenceTd"><p align="LEFT">PXF cannot find Zookeeper's JAR.</p></td></tr><tr><td class="confluenceTd"><p align="LEFT"><em>ERROR: remote component error: 500 (/HTTP/1.1 500 Illegal HBase column name name</em></p></td><td class="confluenceTd"><p align="LEFT">The requested HBase table does not contain the requested column.</p></td></tr><tr><td class="confluenceTd" colspan="2"><p align="LEFT"><strong>Hive Specific Errors</strong></p></td></tr><tr><td class="confluenceTd"><p align="LEFT"><em>ERROR: remote component error: 500</em></p><p align="LEFT">DETAIL: Problem accessing <em>/gpdb/v<X>/Fragmenter/getFragments</em>. Reason: <em>java.net.ConnectException:</em> Connection refused</p></td><td class="confluenceTd"><p align="LEFT">Hive Metastore service is down.</p></td></tr><tr><td class="confluenceTd"><p align="LEFT"><em>ERROR: remote component error: 500</em></p><p align="LEFT">DETAIL: Problem accessing <em>/gpdb/v<X>/Fragmenter/getFragments</em>. Reason: <em>NoSuchObjectException(message:default.players table not found)</em></p></td><td class="confluenceTd"><p align="LEFT">Table doesn't exist in Hive.</p></td></tr><tr><td class="confluenceTd" colspan="2"><strong>GemfireXD Specific Errors</strong></td></tr><tr><td class="confluenceTd" colspan="1">No data or wrong data comes back, or the data comes back with very poor performance.</td><td class="confluenceTd" colspan="1">See the GemFireXD connector documentation in the GemFireXD product documentation. There are various GemFireXD connector options that need to be used properly in order to get the right results with good performance.</td></tr></tbody></table></div>
</div></div>
</div><!-- end of content-->
</div><!-- end of container -->
</div><!--end of container-fluid-->
</div><!--end of main-wrap-->
<div class="site-footer desktop-only">
<div class="container-fluid">
<div class="site-footer-links">
<span class="version"><a href='/'>Pivotal Documentation</a></span>
<span>©
<script>
var d = new Date();
document.write(d.getFullYear());
</script>
<a href='http://gopivotal.com'>Pivotal Software</a> Inc. All Rights Reserved.
</span>
</div>
</div>
</div>
<script type="text/javascript">
(function() {
var didInit = false;
function initMunchkin() {
if(didInit === false) {
didInit = true;
Munchkin.init('625-IUJ-009');
}
}
var s = document.createElement('script');
s.type = 'text/javascript';
s.async = true;
s.src = document.location.protocol + '//munchkin.marketo.net/munchkin.js';
s.onreadystatechange = function() {
if (this.readyState == 'complete' || this.readyState == 'loaded') {
initMunchkin();
}
};
s.onload = initMunchkin;
document.getElementsByTagName('head')[0].appendChild(s);
})();
</script>
</div><!--end of viewport-->
<div id="scrim"></div>
</body>
</html>