From aa3a7368b3d75ba872ce55d116d2ad8f4d907d05 Mon Sep 17 00:00:00 2001
From: Vasilis Efthymiou
Date: Sat, 30 Jul 2016 18:29:37 -0400
Subject: [PATCH] added as maven project

---
 MetaBlocking/nbactions.xml | 46 +++ MetaBlocking/pom.xml | 60 +++ .../java/advanced/AverageWeightCombiner.java | 34 ++ .../java/advanced/AverageWeightDriver.java | 123 ++++++ .../java/advanced/AverageWeightEJSDriver.java | 99 +++++ .../java/advanced/AverageWeightEJSMapper.java | 151 +++++++ .../java/advanced/AverageWeightMapper.java | 151 +++++++ .../AverageWeightMapperNewFromCompressed.java | 129 ++++++ .../java/advanced/AverageWeightReducer.java | 50 +++ .../main/java/advanced/CEPCountingDriver.java | 107 +++++ .../java/advanced/CEPCountingEJSDriver.java | 98 +++++ .../src/main/java/advanced/CEPEJSMapper.java | 147 +++++++ .../main/java/advanced/CEPFinalDriver.java | 106 +++++ .../main/java/advanced/CEPFinalEJSDriver.java | 111 ++++++ .../main/java/advanced/CEPFinalEJSMapper.java | 148 +++++++ .../java/advanced/CEPFinalEJSMapperOnly.java | 145 +++++++ .../main/java/advanced/CEPFinalMapper.java | 141 +++++++ .../main/java/advanced/CEPFinalMapperNew.java | 119 ++++++ .../java/advanced/CEPFinalMapperNewEJS.java | 121 ++++++ .../java/advanced/CEPFinalMapperOnly.java | 138 +++++++ .../java/advanced/CEPFinalMapperOnlyNew.java | 116 ++++++ .../advanced/CEPFinalMapperOnlyNewEJS.java | 119 ++++++ .../src/main/java/advanced/CEPMapper.java | 138 +++++++ .../src/main/java/advanced/CEPMapperNew.java | 113 ++++++ .../main/java/advanced/CEPMapperNewEJS.java | 114 ++++++ MetaBlocking/src/main/java/advanced/CNP.java | 68 ++++ .../src/main/java/advanced/CNPDriver.java | 90 +++++ .../src/main/java/advanced/CNPEJSDriver.java | 84 ++++ .../src/main/java/advanced/NPMapper.java | 160 ++++++++ .../src/main/java/advanced/NPMapperEJS.java | 159 ++++++++ .../java/advanced/NPMapperFromCompressed.java | 153 ++++++++ .../src/main/java/advanced/NPMapperNew.java | 108 +++++ .../advanced/NPMapperNewFromCompressed.java | 168 ++++++++ .../src/main/java/advanced/PCNPDriver.java | 78 ++++ .../src/main/java/advanced/PNPMapper.java | 104 +++++ .../src/main/java/advanced/PWNPDriver.java | 61 +++ .../src/main/java/advanced/WEPDriver.java | 73 ++++ .../src/main/java/advanced/WEPMapper.java | 106 +++++ .../src/main/java/advanced/WEPMapperOnly.java | 36 ++ .../src/main/java/advanced/WEPReducer.java | 52 +++ .../src/main/java/advanced/WNPDriver.java | 86 ++++ .../src/main/java/advanced/WNPEJSDriver.java | 80 ++++ .../src/main/java/advanced/WNPMapper.java | 108 +++++ .../src/main/java/advanced/WNPReducer.java | 31 ++ .../main/java/blockingGraphBuilding/ARCS.java | 33 ++ .../blockingGraphBuilding/ARCSDriver.java | 66 ++++ .../blockingGraphBuilding/ARCSMapper.java | 40 ++ .../blockingGraphBuilding/ARCSReducer.java | 79 ++++ .../ARCSReducerDirty.java | 70 ++++ .../AllBlockComparisonsDriver.java | 93 +++++ .../AllBlockComparisonsDriverBalanced.java | 94 +++++ ...lockComparisonsDriverBalancedAdvanced.java | 179 +++++++++ .../AllBlockComparisonsMapper.java | 44 +++ .../AllBlockComparisonsParitioner.java | 93 +++++ .../AllBlockComparisonsReducer.java | 82 ++++ .../AllBlockComparisonsReducerDirty.java | 78 ++++ .../blockingGraphBuilding/BlockingGraph.java | 80 ++++ .../BlockingGraphARCS.java | 64 +++ .../BlockingGraphEJS.java | 63 +++ .../main/java/blockingGraphBuilding/CBS.java | 33 ++ .../main/java/blockingGraphBuilding/ECBS.java | 47 +++ .../main/java/blockingGraphBuilding/EJS.java | 59 +++ .../java/blockingGraphBuilding/EJSDriver.java | 57 +++
.../java/blockingGraphBuilding/EJSMapper.java | 35 ++ .../blockingGraphBuilding/EJSReducer.java | 62 +++ .../main/java/blockingGraphBuilding/JS.java | 37 ++ .../blockingGraphBuilding/SumCombiner.java | 27 ++ .../AverageWeightCombiner.java | 34 ++ .../AverageWeightDriver.java | 57 +++ .../AverageWeightMapper.java | 37 ++ .../AverageWeightReducer.java | 49 +++ .../blockingGraphPruning/CEPCombiner.java | 31 ++ .../CEPCountingDriver.java | 75 ++++ .../blockingGraphPruning/CEPFinalDriver.java | 84 ++++ .../blockingGraphPruning/CEPFinalMapper.java | 37 ++ .../CEPFinalMapperOnly.java | 36 ++ .../blockingGraphPruning/CEPFinalReducer.java | 47 +++ .../java/blockingGraphPruning/CEPMapper.java | 31 ++ .../java/blockingGraphPruning/CEPReducer.java | 46 +++ .../blockingGraphPruning/CEPTotalOrder.java | 75 ++++ .../main/java/blockingGraphPruning/CNP.java | 63 +++ .../java/blockingGraphPruning/CNPDriver.java | 80 ++++ .../java/blockingGraphPruning/NPMapper.java | 41 ++ .../java/blockingGraphPruning/PCNPDriver.java | 59 +++ .../java/blockingGraphPruning/PNPMapper.java | 41 ++ .../java/blockingGraphPruning/PWNPDriver.java | 59 +++ .../java/blockingGraphPruning/Reciprocal.java | 37 ++ .../ReciprocalDriver.java | 54 +++ .../main/java/blockingGraphPruning/WEP.java | 54 +++ .../java/blockingGraphPruning/WEPDriver.java | 59 +++ .../main/java/blockingGraphPruning/WNP.java | 65 +++ .../java/blockingGraphPruning/WNPDriver.java | 64 +++ .../java/entityBased/EntityBasedDriver.java | 84 ++++ .../EntityBasedDriverAverageWeight.java | 113 ++++++ .../EntityBasedDriverAverageWeightARCS.java | 92 +++++ .../EntityBasedDriverAverageWeightEJS.java | 121 ++++++ .../entityBased/EntityBasedDriverCEP1.java | 92 +++++ .../EntityBasedDriverCEP1ARCS.java | 79 ++++ .../entityBased/EntityBasedDriverCEP1EJS.java | 107 +++++ .../entityBased/EntityBasedDriverCEP2.java | 97 +++++ .../entityBased/EntityBasedDriverCEP3.java | 121 ++++++ .../EntityBasedDriverCEP3ARCS.java | 103 +++++ .../entityBased/EntityBasedDriverCEP3EJS.java | 111 ++++++ .../entityBased/EntityBasedDriverCNP.java | 118 ++++++ .../entityBased/EntityBasedDriverCNPARCS.java | 86 ++++ .../entityBased/EntityBasedDriverCNPEJS.java | 110 ++++++ .../entityBased/EntityBasedDriverWEP.java | 108 +++++ .../entityBased/EntityBasedDriverWEPARCS.java | 87 ++++ .../entityBased/EntityBasedDriverWEPEJS.java | 115 ++++++ .../entityBased/EntityBasedDriverWNP.java | 89 +++++ .../entityBased/EntityBasedDriverWNPARCS.java | 63 +++ .../entityBased/EntityBasedDriverWNPEJS.java | 101 +++++ .../entityBased/EntityBasedIndexDriver.java | 60 +++ .../entityBased/EntityBasedIndexMapper.java | 40 ++ .../entityBased/EntityBasedIndexReducer.java | 129 ++++++ .../EntityBasedIndexReducerMemory.java | 102 +++++ .../java/entityBased/EntityBasedMapper.java | 60 +++ .../EntityBasedMapperFromCompressed.java | 56 +++ .../EntityBasedMapperFromCompressedNP.java | 39 ++ ...yBasedMapperFromCompressedNPARCSClean.java | 84 ++++ ...ntityBasedMapperFromCompressedNPClean.java | 74 ++++ .../java/entityBased/EntityBasedReducer.java | 47 +++ .../EntityBasedReducerAverageWeight.java | 130 ++++++ ...ityBasedReducerAverageWeightARCSClean.java | 63 +++ ...ityBasedReducerAverageWeightARCSDirty.java | 62 +++ .../EntityBasedReducerAverageWeightEJS.java | 131 +++++++ .../entityBased/EntityBasedReducerCEP.java | 128 ++++++ .../EntityBasedReducerCEPARCSClean.java | 62 +++ .../EntityBasedReducerCEPARCSDirty.java | 61 +++ .../entityBased/EntityBasedReducerCEPEJS.java | 128 ++++++ .../EntityBasedReducerCEPFinal.java | 136 +++++++ 
.../EntityBasedReducerCEPFinalARCSClean.java | 76 ++++ .../EntityBasedReducerCEPFinalARCSDirty.java | 76 ++++ .../EntityBasedReducerCEPFinalEJS.java | 129 ++++++ .../entityBased/EntityBasedReducerCNP.java | 164 ++++++++ .../EntityBasedReducerCNPARCSClean.java | 83 ++++ .../EntityBasedReducerCNPARCSDirty.java | 81 ++++ .../entityBased/EntityBasedReducerCNPEJS.java | 162 ++++++++ .../entityBased/EntityBasedReducerWEP.java | 136 +++++++ .../EntityBasedReducerWEPARCSClean.java | 73 ++++ .../EntityBasedReducerWEPARCSDirty.java | 71 ++++ .../entityBased/EntityBasedReducerWEPEJS.java | 141 +++++++ .../entityBased/EntityBasedReducerWNP.java | 155 ++++++++ .../EntityBasedReducerWNPARCSClean.java | 76 ++++ .../EntityBasedReducerWNPARCSDirty.java | 74 ++++ .../entityBased/EntityBasedReducerWNPEJS.java | 144 +++++++ .../java/entityBased/NodeDegreeDriver.java | 99 +++++ .../java/entityBased/NodeDegreeReducer.java | 52 +++ .../DescendingDoubleComparator.java | 30 ++ .../hadoopUtils/DescendingVIntComparator.java | 30 ++ .../main/java/hadoopUtils/InverseReducer.java | 25 ++ .../src/main/java/hadoopUtils/MBTools.java | 370 ++++++++++++++++++ .../main/java/hadoopUtils/MapSortByValue.java | 38 ++ .../src/main/java/hadoopUtils/Partition.java | 38 ++ .../java/hadoopUtils/PartitionComparator.java | 17 + .../java/hadoopUtils/ReadHadoopStats.java | 59 +++ .../RelativePositionCompression.java | 257 ++++++++++++ .../java/hadoopUtils/ValueComparator.java | 21 + ...erFilteringBlockSizeByteCounterMapper.java | 42 ++ ...rFilteringBlockSizeByteCounterReducer.java | 84 ++++ ...eringBlockSizeByteCounterReducerDirty.java | 66 ++++ .../AfterFilteringBlockSizeCounterMapper.java | 42 ++ ...AfterFilteringBlockSizeCounterReducer.java | 35 ++ .../AfterFilteringByteCounter.java | 51 +++ .../preprocessing/AfterFilteringCounter.java | 46 +++ .../BasicEntityPruningCombiner.java | 70 ++++ .../BasicEntityPruningDriver.java | 65 +++ .../BasicEntityPruningMapper.java | 67 ++++ .../BasicEntityPruningReducer.java | 68 ++++ .../BasicEntityPruningReducerNew.java | 73 ++++ .../preprocessing/BlockSizeCounterDriver.java | 49 +++ .../preprocessing/BlockSizeCounterMapper.java | 70 ++++ .../BlocksFromEntityIndexDriver.java | 102 +++++ ...ityIndexDriverBalancedFixedPartitions.java | 216 ++++++++++ .../BlocksFromEntityIndexDriverMaxBlock.java | 238 +++++++++++ .../BlocksFromEntityIndexMapper.java | 37 ++ .../BlocksFromEntityIndexParitioner.java | 93 +++++ .../BlocksFromEntityIndexReducer.java | 64 +++ .../java/preprocessing/BlocksPerEntity.java | 92 +++++ .../main/java/preprocessing/EJSDriver.java | 88 +++++ .../java/preprocessing/EJSFinalDriver.java | 65 +++ .../main/java/preprocessing/EJSMapper.java | 164 ++++++++ .../main/java/preprocessing/EJSReducer.java | 51 +++ .../preprocessing/EntityIdsToIntDriver.java | 50 +++ .../preprocessing/EntityIdsToIntMapper.java | 63 +++ .../java/preprocessing/EntityIndexDriver.java | 106 +++++ .../preprocessing/EntityIndexDriverARCS.java | 98 +++++ .../java/preprocessing/EntityIndexMapper.java | 40 ++ .../preprocessing/EntityIndexMapperARCS.java | 47 +++ .../preprocessing/EntityIndexReducer.java | 113 ++++++ .../preprocessing/EntityIndexReducerARCS.java | 120 ++++++ .../EntityIndexReducerNoFiltering.java | 57 +++ .../preprocessing/EntityPruningCombiner.java | 79 ++++ .../EntityPruningDirtyFinalMapper.java | 95 +++++ .../preprocessing/EntityPruningDriver.java | 70 ++++ .../EntityPruningFinalDriver.java | 113 ++++++ .../EntityPruningFinalMapper.java | 109 ++++++ .../EntityPruningFinalReducer.java | 33 ++ 
.../preprocessing/EntityPruningMapper.java | 107 +++++ .../preprocessing/EntityPruningMapperNew.java | 112 ++++++ .../preprocessing/EntityPruningReducer.java | 109 ++++++ .../EntityPruningReducerNew.java | 52 +++ .../preprocessing/ExtendedInputDriver.java | 105 +++++ .../preprocessing/ExtendedInputMapper.java | 47 +++ .../ExtendedInputMapperARCS.java | 46 +++ .../preprocessing/ExtendedInputReducer.java | 60 +++ .../ExtendedInputReducerNew.java | 66 ++++ .../java/preprocessing/TextToSequence.java | 54 +++ .../preprocessing/TextToSequenceMapper.java | 44 +++ .../TextToSequenceMapperArrayWritable.java | 50 +++ .../java/preprocessing/VIntArrayWritable.java | 75 ++++ MetaBlocking/target/MetaBlocking-1.0.jar | Bin 0 -> 472091 bytes .../target/classes/.netbeans_automatic_build | 0 .../advanced/AverageWeightCombiner.class | Bin 0 -> 2223 bytes .../advanced/AverageWeightDriver.class | Bin 0 -> 6133 bytes .../advanced/AverageWeightEJSDriver.class | Bin 0 -> 5251 bytes .../AverageWeightEJSMapper$OutputData.class | Bin 0 -> 1115 bytes .../AverageWeightEJSMapper$Weight.class | Bin 0 -> 1088 bytes .../advanced/AverageWeightEJSMapper.class | Bin 0 -> 6174 bytes .../AverageWeightMapper$OutputData.class | Bin 0 -> 1145 bytes .../advanced/AverageWeightMapper$Weight.class | Bin 0 -> 1064 bytes .../advanced/AverageWeightMapper.class | Bin 0 -> 6020 bytes ...htMapperNewFromCompressed$OutputData.class | Bin 0 -> 1281 bytes ...WeightMapperNewFromCompressed$Weight.class | Bin 0 -> 1200 bytes ...AverageWeightMapperNewFromCompressed.class | Bin 0 -> 5120 bytes .../advanced/AverageWeightReducer.class | Bin 0 -> 3602 bytes .../classes/advanced/CEPCountingDriver.class | Bin 0 -> 5095 bytes .../advanced/CEPCountingEJSDriver.class | Bin 0 -> 4831 bytes .../advanced/CEPEJSMapper$OutputData.class | Bin 0 -> 1035 bytes .../classes/advanced/CEPEJSMapper.class | Bin 0 -> 6294 bytes .../classes/advanced/CEPFinalDriver.class | Bin 0 -> 5268 bytes .../classes/advanced/CEPFinalEJSDriver.class | Bin 0 -> 5431 bytes .../CEPFinalEJSMapper$OutputData.class | Bin 0 -> 1075 bytes .../classes/advanced/CEPFinalEJSMapper.class | Bin 0 -> 6539 bytes .../CEPFinalEJSMapperOnly$OutputData.class | Bin 0 -> 1164 bytes .../advanced/CEPFinalEJSMapperOnly.class | Bin 0 -> 6327 bytes .../advanced/CEPFinalMapper$OutputData.class | Bin 0 -> 1051 bytes .../classes/advanced/CEPFinalMapper.class | Bin 0 -> 5956 bytes .../CEPFinalMapperNew$OutputData.class | Bin 0 -> 1075 bytes .../classes/advanced/CEPFinalMapperNew.class | Bin 0 -> 4846 bytes .../CEPFinalMapperNewEJS$OutputData.class | Bin 0 -> 1099 bytes .../advanced/CEPFinalMapperNewEJS.class | Bin 0 -> 4979 bytes .../CEPFinalMapperOnly$OutputData.class | Bin 0 -> 1083 bytes .../classes/advanced/CEPFinalMapperOnly.class | Bin 0 -> 5976 bytes .../CEPFinalMapperOnlyNew$OutputData.class | Bin 0 -> 1107 bytes .../advanced/CEPFinalMapperOnlyNew.class | Bin 0 -> 4866 bytes .../CEPFinalMapperOnlyNewEJS$OutputData.class | Bin 0 -> 1131 bytes .../advanced/CEPFinalMapperOnlyNewEJS.class | Bin 0 -> 4999 bytes .../advanced/CEPMapper$OutputData.class | Bin 0 -> 1011 bytes .../target/classes/advanced/CEPMapper.class | Bin 0 -> 5693 bytes .../advanced/CEPMapperNew$OutputData.class | Bin 0 -> 1035 bytes .../classes/advanced/CEPMapperNew.class | Bin 0 -> 4627 bytes .../advanced/CEPMapperNewEJS$OutputData.class | Bin 0 -> 1059 bytes .../classes/advanced/CEPMapperNewEJS.class | Bin 0 -> 4582 bytes .../target/classes/advanced/CNP.class | Bin 0 -> 4425 bytes .../target/classes/advanced/CNPDriver.class | Bin 0 -> 4756 
bytes .../classes/advanced/CNPEJSDriver.class | Bin 0 -> 4391 bytes .../classes/advanced/NPMapper$InputData.class | Bin 0 -> 994 bytes .../advanced/NPMapper$OutputData.class | Bin 0 -> 1003 bytes .../classes/advanced/NPMapper$Weight.class | Bin 0 -> 976 bytes .../target/classes/advanced/NPMapper.class | Bin 0 -> 6243 bytes .../advanced/NPMapperEJS$OutputData.class | Bin 0 -> 1027 bytes .../target/classes/advanced/NPMapperEJS.class | Bin 0 -> 6110 bytes .../NPMapperFromCompressed$OutputData.class | Bin 0 -> 1115 bytes .../NPMapperFromCompressed$Weight.class | Bin 0 -> 1088 bytes .../advanced/NPMapperFromCompressed.class | Bin 0 -> 6051 bytes .../advanced/NPMapperNew$InputData.class | Bin 0 -> 1018 bytes .../advanced/NPMapperNew$OutputData.class | Bin 0 -> 1027 bytes .../classes/advanced/NPMapperNew$Weight.class | Bin 0 -> 1000 bytes .../target/classes/advanced/NPMapperNew.class | Bin 0 -> 4860 bytes .../NPMapperNewFromCompressed$InputData.class | Bin 0 -> 1130 bytes ...NPMapperNewFromCompressed$OutputData.class | Bin 0 -> 1139 bytes .../NPMapperNewFromCompressed$Weight.class | Bin 0 -> 1112 bytes .../advanced/NPMapperNewFromCompressed.class | Bin 0 -> 5462 bytes .../target/classes/advanced/PCNPDriver.class | Bin 0 -> 3788 bytes .../advanced/PNPMapper$OutputData.class | Bin 0 -> 1011 bytes .../classes/advanced/PNPMapper$Weight.class | Bin 0 -> 984 bytes .../target/classes/advanced/PNPMapper.class | Bin 0 -> 5670 bytes .../target/classes/advanced/PWNPDriver.class | Bin 0 -> 2668 bytes .../target/classes/advanced/WEPDriver.class | Bin 0 -> 3389 bytes .../advanced/WEPMapper$OutputData.class | Bin 0 -> 1011 bytes .../classes/advanced/WEPMapper$Weight.class | Bin 0 -> 984 bytes .../target/classes/advanced/WEPMapper.class | Bin 0 -> 5948 bytes .../advanced/WEPMapperOnly$OutputData.class | Bin 0 -> 1044 bytes .../classes/advanced/WEPMapperOnly.class | Bin 0 -> 2708 bytes .../target/classes/advanced/WEPReducer.class | Bin 0 -> 3651 bytes .../target/classes/advanced/WNPDriver.class | Bin 0 -> 4380 bytes .../classes/advanced/WNPEJSDriver.class | Bin 0 -> 4003 bytes .../advanced/WNPMapper$OutputData.class | Bin 0 -> 1011 bytes .../classes/advanced/WNPMapper$Weight.class | Bin 0 -> 984 bytes .../target/classes/advanced/WNPMapper.class | Bin 0 -> 5722 bytes .../target/classes/advanced/WNPReducer.class | Bin 0 -> 2348 bytes .../classes/blockingGraphBuilding/ARCS.class | Bin 0 -> 2167 bytes .../blockingGraphBuilding/ARCSDriver.class | Bin 0 -> 2984 bytes .../ARCSMapper$InputData.class | Bin 0 -> 1218 bytes .../blockingGraphBuilding/ARCSMapper.class | Bin 0 -> 2266 bytes .../ARCSReducer$OutputData.class | Bin 0 -> 1118 bytes .../blockingGraphBuilding/ARCSReducer.class | Bin 0 -> 3746 bytes .../ARCSReducerDirty$OutputData.class | Bin 0 -> 1158 bytes .../ARCSReducerDirty.class | Bin 0 -> 3758 bytes .../AllBlockComparisonsDriver.class | Bin 0 -> 4572 bytes .../AllBlockComparisonsDriverBalanced.class | Bin 0 -> 4771 bytes .../AllBlockComparisonsMapper$InputData.class | Bin 0 -> 1338 bytes .../AllBlockComparisonsMapper.class | Bin 0 -> 2777 bytes .../AllBlockComparisonsParitioner.class | Bin 0 -> 2952 bytes ...llBlockComparisonsReducer$OutputData.class | Bin 0 -> 1238 bytes .../AllBlockComparisonsReducer.class | Bin 0 -> 3912 bytes ...ckComparisonsReducerDirty$OutputData.class | Bin 0 -> 1278 bytes .../AllBlockComparisonsReducerDirty.class | Bin 0 -> 3766 bytes .../blockingGraphBuilding/BlockingGraph.class | Bin 0 -> 3750 bytes .../BlockingGraphARCS.class | Bin 0 -> 2993 bytes .../BlockingGraphEJS.class | Bin 0 
-> 2948 bytes .../classes/blockingGraphBuilding/CBS.class | Bin 0 -> 2590 bytes .../classes/blockingGraphBuilding/ECBS.class | Bin 0 -> 3239 bytes .../classes/blockingGraphBuilding/EJS.class | Bin 0 -> 4173 bytes .../blockingGraphBuilding/EJSDriver.class | Bin 0 -> 2813 bytes .../blockingGraphBuilding/EJSMapper.class | Bin 0 -> 2333 bytes .../blockingGraphBuilding/EJSReducer.class | Bin 0 -> 4286 bytes .../classes/blockingGraphBuilding/JS.class | Bin 0 -> 2821 bytes .../blockingGraphBuilding/SumCombiner.class | Bin 0 -> 2134 bytes .../AverageWeightCombiner.class | Bin 0 -> 2183 bytes .../AverageWeightDriver.class | Bin 0 -> 2318 bytes .../AverageWeightMapper.class | Bin 0 -> 1965 bytes .../AverageWeightReducer.class | Bin 0 -> 3586 bytes .../blockingGraphPruning/CEPCombiner.class | Bin 0 -> 2310 bytes .../CEPCountingDriver.class | Bin 0 -> 3830 bytes .../blockingGraphPruning/CEPFinalDriver.class | Bin 0 -> 4209 bytes .../blockingGraphPruning/CEPFinalMapper.class | Bin 0 -> 2468 bytes .../CEPFinalMapperOnly.class | Bin 0 -> 2372 bytes .../CEPFinalReducer.class | Bin 0 -> 2677 bytes .../blockingGraphPruning/CEPMapper.class | Bin 0 -> 2005 bytes .../blockingGraphPruning/CEPReducer.class | Bin 0 -> 2726 bytes .../blockingGraphPruning/CEPTotalOrder.class | Bin 0 -> 3063 bytes .../blockingGraphPruning/CNP$Output.class | Bin 0 -> 1017 bytes .../classes/blockingGraphPruning/CNP.class | Bin 0 -> 3854 bytes .../blockingGraphPruning/CNPDriver.class | Bin 0 -> 4052 bytes .../blockingGraphPruning/NPMapper.class | Bin 0 -> 2692 bytes .../blockingGraphPruning/PCNPDriver.class | Bin 0 -> 2633 bytes .../blockingGraphPruning/PNPMapper.class | Bin 0 -> 2600 bytes .../blockingGraphPruning/PWNPDriver.class | Bin 0 -> 2633 bytes .../blockingGraphPruning/Reciprocal.class | Bin 0 -> 2147 bytes .../ReciprocalDriver.class | Bin 0 -> 2426 bytes .../classes/blockingGraphPruning/WEP.class | Bin 0 -> 3031 bytes .../blockingGraphPruning/WEPDriver.class | Bin 0 -> 2852 bytes .../blockingGraphPruning/WNP$Output.class | Bin 0 -> 1017 bytes .../classes/blockingGraphPruning/WNP.class | Bin 0 -> 4090 bytes .../blockingGraphPruning/WNPDriver.class | Bin 0 -> 2932 bytes .../entityBased/EntityBasedDriver.class | Bin 0 -> 4088 bytes .../EntityBasedDriverAverageWeight.class | Bin 0 -> 4999 bytes .../EntityBasedDriverAverageWeightARCS.class | Bin 0 -> 4597 bytes .../EntityBasedDriverAverageWeightEJS.class | Bin 0 -> 5922 bytes .../entityBased/EntityBasedDriverCEP1.class | Bin 0 -> 3592 bytes .../EntityBasedDriverCEP1ARCS.class | Bin 0 -> 3028 bytes .../EntityBasedDriverCEP1EJS.class | Bin 0 -> 4824 bytes .../entityBased/EntityBasedDriverCEP2.class | Bin 0 -> 4887 bytes .../entityBased/EntityBasedDriverCEP3.class | Bin 0 -> 4857 bytes .../EntityBasedDriverCEP3ARCS.class | Bin 0 -> 4432 bytes .../EntityBasedDriverCEP3EJS.class | Bin 0 -> 4785 bytes .../entityBased/EntityBasedDriverCNP.class | Bin 0 -> 4583 bytes .../EntityBasedDriverCNPARCS.class | Bin 0 -> 4158 bytes .../entityBased/EntityBasedDriverCNPEJS.class | Bin 0 -> 4888 bytes .../entityBased/EntityBasedDriverWEP.class | Bin 0 -> 4162 bytes .../EntityBasedDriverWEPARCS.class | Bin 0 -> 3756 bytes .../entityBased/EntityBasedDriverWEPEJS.class | Bin 0 -> 4753 bytes .../entityBased/EntityBasedDriverWNP.class | Bin 0 -> 3232 bytes .../EntityBasedDriverWNPARCS.class | Bin 0 -> 2679 bytes .../entityBased/EntityBasedDriverWNPEJS.class | Bin 0 -> 4464 bytes .../entityBased/EntityBasedIndexDriver.class | Bin 0 -> 2617 bytes .../entityBased/EntityBasedIndexMapper.class | Bin 0 -> 2414 
bytes .../EntityBasedIndexReducer$OutputData.class | Bin 0 -> 1307 bytes .../entityBased/EntityBasedIndexReducer.class | Bin 0 -> 6346 bytes ...tyBasedIndexReducerMemory$OutputData.class | Bin 0 -> 1355 bytes .../EntityBasedIndexReducerMemory.class | Bin 0 -> 5441 bytes .../entityBased/EntityBasedMapper.class | Bin 0 -> 3337 bytes .../EntityBasedMapperFromCompressed.class | Bin 0 -> 2867 bytes .../EntityBasedMapperFromCompressedNP.class | Bin 0 -> 2763 bytes ...BasedMapperFromCompressedNPARCSClean.class | Bin 0 -> 3961 bytes ...tityBasedMapperFromCompressedNPClean.class | Bin 0 -> 3767 bytes .../entityBased/EntityBasedReducer.class | Bin 0 -> 2876 bytes ...tityBasedReducerAverageWeight$Weight.class | Bin 0 -> 1233 bytes .../EntityBasedReducerAverageWeight.class | Bin 0 -> 7005 bytes ...ReducerAverageWeightARCSClean$Weight.class | Bin 0 -> 1305 bytes ...tyBasedReducerAverageWeightARCSClean.class | Bin 0 -> 3804 bytes ...ReducerAverageWeightARCSDirty$Weight.class | Bin 0 -> 1305 bytes ...tyBasedReducerAverageWeightARCSDirty.class | Bin 0 -> 3809 bytes ...yBasedReducerAverageWeightEJS$Weight.class | Bin 0 -> 1257 bytes .../EntityBasedReducerAverageWeightEJS.class | Bin 0 -> 6076 bytes .../entityBased/EntityBasedReducerCEP.class | Bin 0 -> 6952 bytes .../EntityBasedReducerCEPARCSClean.class | Bin 0 -> 3734 bytes .../EntityBasedReducerCEPARCSDirty.class | Bin 0 -> 3732 bytes .../EntityBasedReducerCEPEJS.class | Bin 0 -> 6122 bytes .../EntityBasedReducerCEPFinal$Output.class | Bin 0 -> 1138 bytes .../EntityBasedReducerCEPFinal.class | Bin 0 -> 7081 bytes ...BasedReducerCEPFinalARCSClean$Output.class | Bin 0 -> 1210 bytes .../EntityBasedReducerCEPFinalARCSClean.class | Bin 0 -> 4231 bytes ...BasedReducerCEPFinalARCSDirty$Output.class | Bin 0 -> 1210 bytes .../EntityBasedReducerCEPFinalARCSDirty.class | Bin 0 -> 4229 bytes ...EntityBasedReducerCEPFinalEJS$Output.class | Bin 0 -> 1162 bytes .../EntityBasedReducerCEPFinalEJS.class | Bin 0 -> 6121 bytes .../EntityBasedReducerCNP$Output.class | Bin 0 -> 1098 bytes .../entityBased/EntityBasedReducerCNP.class | Bin 0 -> 7806 bytes ...ntityBasedReducerCNPARCSClean$Output.class | Bin 0 -> 1170 bytes .../EntityBasedReducerCNPARCSClean.class | Bin 0 -> 4530 bytes ...ntityBasedReducerCNPARCSDirty$Output.class | Bin 0 -> 1170 bytes .../EntityBasedReducerCNPARCSDirty.class | Bin 0 -> 4535 bytes .../EntityBasedReducerCNPEJS$Output.class | Bin 0 -> 1122 bytes .../EntityBasedReducerCNPEJS.class | Bin 0 -> 6781 bytes .../EntityBasedReducerWEP$Output.class | Bin 0 -> 1098 bytes .../entityBased/EntityBasedReducerWEP.class | Bin 0 -> 7103 bytes ...ntityBasedReducerWEPARCSClean$Output.class | Bin 0 -> 1170 bytes .../EntityBasedReducerWEPARCSClean.class | Bin 0 -> 4364 bytes ...ntityBasedReducerWEPARCSDirty$Output.class | Bin 0 -> 1170 bytes .../EntityBasedReducerWEPARCSDirty.class | Bin 0 -> 4369 bytes .../EntityBasedReducerWEPEJS$Output.class | Bin 0 -> 1122 bytes .../EntityBasedReducerWEPEJS.class | Bin 0 -> 6249 bytes .../EntityBasedReducerWNP$Output.class | Bin 0 -> 1098 bytes .../entityBased/EntityBasedReducerWNP.class | Bin 0 -> 7562 bytes ...ntityBasedReducerWNPARCSClean$Output.class | Bin 0 -> 1170 bytes .../EntityBasedReducerWNPARCSClean.class | Bin 0 -> 3947 bytes ...ntityBasedReducerWNPARCSDirty$Output.class | Bin 0 -> 1170 bytes .../EntityBasedReducerWNPARCSDirty.class | Bin 0 -> 3952 bytes .../EntityBasedReducerWNPEJS$Output.class | Bin 0 -> 1122 bytes .../EntityBasedReducerWNPEJS.class | Bin 0 -> 6211 bytes .../entityBased/NodeDegreeDriver.class | Bin 0 
-> 4616 bytes .../NodeDegreeReducer$Output.class | Bin 0 -> 1070 bytes .../entityBased/NodeDegreeReducer.class | Bin 0 -> 3000 bytes .../DescendingDoubleComparator.class | Bin 0 -> 794 bytes .../DescendingVIntComparator.class | Bin 0 -> 784 bytes .../classes/hadoopUtils/InverseReducer.class | Bin 0 -> 2022 bytes .../hadoopUtils/MBTools$WeightingScheme.class | Bin 0 -> 1226 bytes .../target/classes/hadoopUtils/MBTools.class | Bin 0 -> 10421 bytes .../hadoopUtils/MapSortByValue$1.class | Bin 0 -> 1206 bytes .../classes/hadoopUtils/MapSortByValue.class | Bin 0 -> 1612 bytes .../classes/hadoopUtils/Partition.class | Bin 0 -> 1367 bytes .../hadoopUtils/PartitionComparator.class | Bin 0 -> 943 bytes .../classes/hadoopUtils/ReadHadoopStats.class | Bin 0 -> 2837 bytes .../RelativePositionCompression.class | Bin 0 -> 5501 bytes .../classes/hadoopUtils/ValueComparator.class | Bin 0 -> 1174 bytes ...BlockSizeByteCounterMapper$InputData.class | Bin 0 -> 1402 bytes ...rFilteringBlockSizeByteCounterMapper.class | Bin 0 -> 2532 bytes ...izeByteCounterReducer$OUTPUT_COUNTER.class | Bin 0 -> 1328 bytes ...FilteringBlockSizeByteCounterReducer.class | Bin 0 -> 4355 bytes ...teCounterReducerDirty$OUTPUT_COUNTER.class | Bin 0 -> 1368 bytes ...ringBlockSizeByteCounterReducerDirty.class | Bin 0 -> 3872 bytes ...ringBlockSizeCounterMapper$InputData.class | Bin 0 -> 1370 bytes ...AfterFilteringBlockSizeCounterMapper.class | Bin 0 -> 2519 bytes ...fterFilteringBlockSizeCounterReducer.class | Bin 0 -> 2284 bytes .../AfterFilteringByteCounter.class | Bin 0 -> 2259 bytes .../preprocessing/AfterFilteringCounter.class | Bin 0 -> 2074 bytes .../BasicEntityPruningCombiner.class | Bin 0 -> 3656 bytes .../BasicEntityPruningDriver.class | Bin 0 -> 2681 bytes .../BasicEntityPruningMapper$InputData.class | Bin 0 -> 1227 bytes .../BasicEntityPruningMapper.class | Bin 0 -> 3979 bytes .../BasicEntityPruningReducer.class | Bin 0 -> 2957 bytes .../BasicEntityPruningReducerNew.class | Bin 0 -> 3509 bytes .../BlockSizeCounterDriver.class | Bin 0 -> 1998 bytes ...ckSizeCounterMapper$InputComparisons.class | Bin 0 -> 1238 bytes .../BlockSizeCounterMapper.class | Bin 0 -> 3430 bytes .../BlocksFromEntityIndexDriver.class | Bin 0 -> 5516 bytes ...tyIndexDriverBalancedFixedPartitions.class | Bin 0 -> 9837 bytes .../BlocksFromEntityIndexDriverMaxBlock.class | Bin 0 -> 10230 bytes .../BlocksFromEntityIndexMapper.class | Bin 0 -> 2210 bytes .../BlocksFromEntityIndexParitioner.class | Bin 0 -> 2897 bytes ...cksFromEntityIndexReducer$OutputData.class | Bin 0 -> 1251 bytes .../BlocksFromEntityIndexReducer.class | Bin 0 -> 3282 bytes .../preprocessing/BlocksPerEntity.class | Bin 0 -> 3352 bytes .../preprocessing/BlocksPerEntityMapper.class | Bin 0 -> 1983 bytes .../classes/preprocessing/EJSDriver.class | Bin 0 -> 4294 bytes .../preprocessing/EJSFinalDriver.class | Bin 0 -> 2945 bytes .../preprocessing/EJSMapper$OutputData.class | Bin 0 -> 1168 bytes .../classes/preprocessing/EJSMapper.class | Bin 0 -> 5939 bytes .../classes/preprocessing/EJSReducer.class | Bin 0 -> 3236 bytes .../preprocessing/EntityIdsToIntDriver.class | Bin 0 -> 2300 bytes .../EntityIdsToIntMapper$InputData.class | Bin 0 -> 1286 bytes .../preprocessing/EntityIdsToIntMapper.class | Bin 0 -> 3429 bytes .../preprocessing/EntityIndexDriver.class | Bin 0 -> 5633 bytes .../preprocessing/EntityIndexDriverARCS.class | Bin 0 -> 5258 bytes .../preprocessing/EntityIndexMapper.class | Bin 0 -> 2403 bytes .../preprocessing/EntityIndexMapperARCS.class | Bin 0 -> 2814 bytes 
.../EntityIndexReducer$OutputData.class | Bin 0 -> 1275 bytes .../preprocessing/EntityIndexReducer.class | Bin 0 -> 5414 bytes .../EntityIndexReducerARCS$OutputData.class | Bin 0 -> 1307 bytes .../EntityIndexReducerARCS.class | Bin 0 -> 5881 bytes ...tyIndexReducerNoFiltering$OutputData.class | Bin 0 -> 1316 bytes .../EntityIndexReducerNoFiltering.class | Bin 0 -> 3163 bytes .../preprocessing/EntityPruningCombiner.class | Bin 0 -> 3015 bytes .../EntityPruningDirtyFinalMapper.class | Bin 0 -> 4816 bytes .../preprocessing/EntityPruningDriver.class | Bin 0 -> 2558 bytes .../EntityPruningFinalDriver.class | Bin 0 -> 5209 bytes .../EntityPruningFinalMapper.class | Bin 0 -> 5136 bytes .../EntityPruningFinalReducer.class | Bin 0 -> 2882 bytes .../EntityPruningMapper$InputData.class | Bin 0 -> 1123 bytes .../preprocessing/EntityPruningMapper.class | Bin 0 -> 4647 bytes .../EntityPruningMapperNew$InputData.class | Bin 0 -> 1211 bytes .../EntityPruningMapperNew.class | Bin 0 -> 4350 bytes .../preprocessing/EntityPruningReducer.class | Bin 0 -> 3119 bytes .../EntityPruningReducerNew.class | Bin 0 -> 2884 bytes .../preprocessing/ExtendedInputDriver.class | Bin 0 -> 5461 bytes .../preprocessing/ExtendedInputMapper.class | Bin 0 -> 2448 bytes .../ExtendedInputMapperARCS.class | Bin 0 -> 2482 bytes .../ExtendedInputReducer$OutputData.class | Bin 0 -> 1133 bytes .../preprocessing/ExtendedInputReducer.class | Bin 0 -> 3267 bytes .../ExtendedInputReducerNew$OutputData.class | Bin 0 -> 1157 bytes .../ExtendedInputReducerNew.class | Bin 0 -> 3528 bytes .../preprocessing/TextToSequence.class | Bin 0 -> 2471 bytes .../preprocessing/TextToSequenceMapper.class | Bin 0 -> 2373 bytes .../TextToSequenceMapperArrayWritable.class | Bin 0 -> 3155 bytes .../preprocessing/VIntArrayWritable.class | Bin 0 -> 2198 bytes .../target/maven-archiver/pom.properties | 4 + .../compile/default-compile/createdFiles.lst | 295 ++++++++++++++ .../compile/default-compile/inputFiles.lst | 208 ++++++++++ .../test-classes/.netbeans_automatic_build | 0 512 files changed, 18390 insertions(+) create mode 100644 MetaBlocking/nbactions.xml create mode 100644 MetaBlocking/pom.xml create mode 100644 MetaBlocking/src/main/java/advanced/AverageWeightCombiner.java create mode 100644 MetaBlocking/src/main/java/advanced/AverageWeightDriver.java create mode 100644 MetaBlocking/src/main/java/advanced/AverageWeightEJSDriver.java create mode 100644 MetaBlocking/src/main/java/advanced/AverageWeightEJSMapper.java create mode 100644 MetaBlocking/src/main/java/advanced/AverageWeightMapper.java create mode 100644 MetaBlocking/src/main/java/advanced/AverageWeightMapperNewFromCompressed.java create mode 100644 MetaBlocking/src/main/java/advanced/AverageWeightReducer.java create mode 100644 MetaBlocking/src/main/java/advanced/CEPCountingDriver.java create mode 100644 MetaBlocking/src/main/java/advanced/CEPCountingEJSDriver.java create mode 100644 MetaBlocking/src/main/java/advanced/CEPEJSMapper.java create mode 100644 MetaBlocking/src/main/java/advanced/CEPFinalDriver.java create mode 100644 MetaBlocking/src/main/java/advanced/CEPFinalEJSDriver.java create mode 100644 MetaBlocking/src/main/java/advanced/CEPFinalEJSMapper.java create mode 100644 MetaBlocking/src/main/java/advanced/CEPFinalEJSMapperOnly.java create mode 100644 MetaBlocking/src/main/java/advanced/CEPFinalMapper.java create mode 100644 MetaBlocking/src/main/java/advanced/CEPFinalMapperNew.java create mode 100644 MetaBlocking/src/main/java/advanced/CEPFinalMapperNewEJS.java create mode 100644 
MetaBlocking/src/main/java/advanced/CEPFinalMapperOnly.java create mode 100644 MetaBlocking/src/main/java/advanced/CEPFinalMapperOnlyNew.java create mode 100644 MetaBlocking/src/main/java/advanced/CEPFinalMapperOnlyNewEJS.java create mode 100644 MetaBlocking/src/main/java/advanced/CEPMapper.java create mode 100644 MetaBlocking/src/main/java/advanced/CEPMapperNew.java create mode 100644 MetaBlocking/src/main/java/advanced/CEPMapperNewEJS.java create mode 100644 MetaBlocking/src/main/java/advanced/CNP.java create mode 100644 MetaBlocking/src/main/java/advanced/CNPDriver.java create mode 100644 MetaBlocking/src/main/java/advanced/CNPEJSDriver.java create mode 100644 MetaBlocking/src/main/java/advanced/NPMapper.java create mode 100644 MetaBlocking/src/main/java/advanced/NPMapperEJS.java create mode 100644 MetaBlocking/src/main/java/advanced/NPMapperFromCompressed.java create mode 100644 MetaBlocking/src/main/java/advanced/NPMapperNew.java create mode 100644 MetaBlocking/src/main/java/advanced/NPMapperNewFromCompressed.java create mode 100644 MetaBlocking/src/main/java/advanced/PCNPDriver.java create mode 100644 MetaBlocking/src/main/java/advanced/PNPMapper.java create mode 100644 MetaBlocking/src/main/java/advanced/PWNPDriver.java create mode 100644 MetaBlocking/src/main/java/advanced/WEPDriver.java create mode 100644 MetaBlocking/src/main/java/advanced/WEPMapper.java create mode 100644 MetaBlocking/src/main/java/advanced/WEPMapperOnly.java create mode 100644 MetaBlocking/src/main/java/advanced/WEPReducer.java create mode 100644 MetaBlocking/src/main/java/advanced/WNPDriver.java create mode 100644 MetaBlocking/src/main/java/advanced/WNPEJSDriver.java create mode 100644 MetaBlocking/src/main/java/advanced/WNPMapper.java create mode 100644 MetaBlocking/src/main/java/advanced/WNPReducer.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/ARCS.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/ARCSDriver.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/ARCSMapper.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/ARCSReducer.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/ARCSReducerDirty.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsDriver.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsDriverBalanced.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsDriverBalancedAdvanced.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsMapper.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsParitioner.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsReducer.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsReducerDirty.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/BlockingGraph.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/BlockingGraphARCS.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/BlockingGraphEJS.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/CBS.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/ECBS.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/EJS.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/EJSDriver.java create 
mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/EJSMapper.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/EJSReducer.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/JS.java create mode 100644 MetaBlocking/src/main/java/blockingGraphBuilding/SumCombiner.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/AverageWeightCombiner.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/AverageWeightDriver.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/AverageWeightMapper.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/AverageWeightReducer.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/CEPCombiner.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/CEPCountingDriver.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/CEPFinalDriver.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/CEPFinalMapper.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/CEPFinalMapperOnly.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/CEPFinalReducer.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/CEPMapper.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/CEPReducer.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/CEPTotalOrder.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/CNP.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/CNPDriver.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/NPMapper.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/PCNPDriver.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/PNPMapper.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/PWNPDriver.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/Reciprocal.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/ReciprocalDriver.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/WEP.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/WEPDriver.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/WNP.java create mode 100644 MetaBlocking/src/main/java/blockingGraphPruning/WNPDriver.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriver.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverAverageWeight.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverAverageWeightARCS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverAverageWeightEJS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP1.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP1ARCS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP1EJS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP2.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP3.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP3ARCS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP3EJS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverCNP.java create mode 100644 
MetaBlocking/src/main/java/entityBased/EntityBasedDriverCNPARCS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverCNPEJS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverWEP.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverWEPARCS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverWEPEJS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverWNP.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverWNPARCS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedDriverWNPEJS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedIndexDriver.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedIndexMapper.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedIndexReducer.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedIndexReducerMemory.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedMapper.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedMapperFromCompressed.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedMapperFromCompressedNP.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedMapperFromCompressedNPARCSClean.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedMapperFromCompressedNPClean.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducer.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerAverageWeight.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerAverageWeightARCSClean.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerAverageWeightARCSDirty.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerAverageWeightEJS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEP.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPARCSClean.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPARCSDirty.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPEJS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPFinal.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPFinalARCSClean.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPFinalARCSDirty.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPFinalEJS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerCNP.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerCNPARCSClean.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerCNPARCSDirty.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerCNPEJS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerWEP.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerWEPARCSClean.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerWEPARCSDirty.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerWEPEJS.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerWNP.java create mode 
100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerWNPARCSClean.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerWNPARCSDirty.java create mode 100644 MetaBlocking/src/main/java/entityBased/EntityBasedReducerWNPEJS.java create mode 100644 MetaBlocking/src/main/java/entityBased/NodeDegreeDriver.java create mode 100644 MetaBlocking/src/main/java/entityBased/NodeDegreeReducer.java create mode 100644 MetaBlocking/src/main/java/hadoopUtils/DescendingDoubleComparator.java create mode 100644 MetaBlocking/src/main/java/hadoopUtils/DescendingVIntComparator.java create mode 100644 MetaBlocking/src/main/java/hadoopUtils/InverseReducer.java create mode 100644 MetaBlocking/src/main/java/hadoopUtils/MBTools.java create mode 100644 MetaBlocking/src/main/java/hadoopUtils/MapSortByValue.java create mode 100644 MetaBlocking/src/main/java/hadoopUtils/Partition.java create mode 100644 MetaBlocking/src/main/java/hadoopUtils/PartitionComparator.java create mode 100644 MetaBlocking/src/main/java/hadoopUtils/ReadHadoopStats.java create mode 100644 MetaBlocking/src/main/java/hadoopUtils/RelativePositionCompression.java create mode 100644 MetaBlocking/src/main/java/hadoopUtils/ValueComparator.java create mode 100644 MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeByteCounterMapper.java create mode 100644 MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeByteCounterReducer.java create mode 100644 MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeByteCounterReducerDirty.java create mode 100644 MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeCounterMapper.java create mode 100644 MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeCounterReducer.java create mode 100644 MetaBlocking/src/main/java/preprocessing/AfterFilteringByteCounter.java create mode 100644 MetaBlocking/src/main/java/preprocessing/AfterFilteringCounter.java create mode 100644 MetaBlocking/src/main/java/preprocessing/BasicEntityPruningCombiner.java create mode 100644 MetaBlocking/src/main/java/preprocessing/BasicEntityPruningDriver.java create mode 100644 MetaBlocking/src/main/java/preprocessing/BasicEntityPruningMapper.java create mode 100644 MetaBlocking/src/main/java/preprocessing/BasicEntityPruningReducer.java create mode 100644 MetaBlocking/src/main/java/preprocessing/BasicEntityPruningReducerNew.java create mode 100644 MetaBlocking/src/main/java/preprocessing/BlockSizeCounterDriver.java create mode 100644 MetaBlocking/src/main/java/preprocessing/BlockSizeCounterMapper.java create mode 100644 MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexDriver.java create mode 100644 MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexDriverBalancedFixedPartitions.java create mode 100644 MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexDriverMaxBlock.java create mode 100644 MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexMapper.java create mode 100644 MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexParitioner.java create mode 100644 MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexReducer.java create mode 100644 MetaBlocking/src/main/java/preprocessing/BlocksPerEntity.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EJSDriver.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EJSFinalDriver.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EJSMapper.java create mode 100644 
MetaBlocking/src/main/java/preprocessing/EJSReducer.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityIdsToIntDriver.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityIdsToIntMapper.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityIndexDriver.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityIndexDriverARCS.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityIndexMapper.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityIndexMapperARCS.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityIndexReducer.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityIndexReducerARCS.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityIndexReducerNoFiltering.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityPruningCombiner.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityPruningDirtyFinalMapper.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityPruningDriver.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityPruningFinalDriver.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityPruningFinalMapper.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityPruningFinalReducer.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityPruningMapper.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityPruningMapperNew.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityPruningReducer.java create mode 100644 MetaBlocking/src/main/java/preprocessing/EntityPruningReducerNew.java create mode 100644 MetaBlocking/src/main/java/preprocessing/ExtendedInputDriver.java create mode 100644 MetaBlocking/src/main/java/preprocessing/ExtendedInputMapper.java create mode 100644 MetaBlocking/src/main/java/preprocessing/ExtendedInputMapperARCS.java create mode 100644 MetaBlocking/src/main/java/preprocessing/ExtendedInputReducer.java create mode 100644 MetaBlocking/src/main/java/preprocessing/ExtendedInputReducerNew.java create mode 100644 MetaBlocking/src/main/java/preprocessing/TextToSequence.java create mode 100644 MetaBlocking/src/main/java/preprocessing/TextToSequenceMapper.java create mode 100644 MetaBlocking/src/main/java/preprocessing/TextToSequenceMapperArrayWritable.java create mode 100644 MetaBlocking/src/main/java/preprocessing/VIntArrayWritable.java create mode 100644 MetaBlocking/target/MetaBlocking-1.0.jar create mode 100644 MetaBlocking/target/classes/.netbeans_automatic_build create mode 100644 MetaBlocking/target/classes/advanced/AverageWeightCombiner.class create mode 100644 MetaBlocking/target/classes/advanced/AverageWeightDriver.class create mode 100644 MetaBlocking/target/classes/advanced/AverageWeightEJSDriver.class create mode 100644 MetaBlocking/target/classes/advanced/AverageWeightEJSMapper$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/AverageWeightEJSMapper$Weight.class create mode 100644 MetaBlocking/target/classes/advanced/AverageWeightEJSMapper.class create mode 100644 MetaBlocking/target/classes/advanced/AverageWeightMapper$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/AverageWeightMapper$Weight.class create mode 100644 MetaBlocking/target/classes/advanced/AverageWeightMapper.class create mode 100644 MetaBlocking/target/classes/advanced/AverageWeightMapperNewFromCompressed$OutputData.class create mode 100644 
MetaBlocking/target/classes/advanced/AverageWeightMapperNewFromCompressed$Weight.class create mode 100644 MetaBlocking/target/classes/advanced/AverageWeightMapperNewFromCompressed.class create mode 100644 MetaBlocking/target/classes/advanced/AverageWeightReducer.class create mode 100644 MetaBlocking/target/classes/advanced/CEPCountingDriver.class create mode 100644 MetaBlocking/target/classes/advanced/CEPCountingEJSDriver.class create mode 100644 MetaBlocking/target/classes/advanced/CEPEJSMapper$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/CEPEJSMapper.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalDriver.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalEJSDriver.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalEJSMapper$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalEJSMapper.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalEJSMapperOnly$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalEJSMapperOnly.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalMapper$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalMapper.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalMapperNew$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalMapperNew.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalMapperNewEJS$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalMapperNewEJS.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalMapperOnly$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalMapperOnly.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalMapperOnlyNew$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalMapperOnlyNew.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalMapperOnlyNewEJS$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/CEPFinalMapperOnlyNewEJS.class create mode 100644 MetaBlocking/target/classes/advanced/CEPMapper$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/CEPMapper.class create mode 100644 MetaBlocking/target/classes/advanced/CEPMapperNew$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/CEPMapperNew.class create mode 100644 MetaBlocking/target/classes/advanced/CEPMapperNewEJS$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/CEPMapperNewEJS.class create mode 100644 MetaBlocking/target/classes/advanced/CNP.class create mode 100644 MetaBlocking/target/classes/advanced/CNPDriver.class create mode 100644 MetaBlocking/target/classes/advanced/CNPEJSDriver.class create mode 100644 MetaBlocking/target/classes/advanced/NPMapper$InputData.class create mode 100644 MetaBlocking/target/classes/advanced/NPMapper$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/NPMapper$Weight.class create mode 100644 MetaBlocking/target/classes/advanced/NPMapper.class create mode 100644 MetaBlocking/target/classes/advanced/NPMapperEJS$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/NPMapperEJS.class create mode 100644 MetaBlocking/target/classes/advanced/NPMapperFromCompressed$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/NPMapperFromCompressed$Weight.class create mode 100644 
MetaBlocking/target/classes/advanced/NPMapperFromCompressed.class create mode 100644 MetaBlocking/target/classes/advanced/NPMapperNew$InputData.class create mode 100644 MetaBlocking/target/classes/advanced/NPMapperNew$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/NPMapperNew$Weight.class create mode 100644 MetaBlocking/target/classes/advanced/NPMapperNew.class create mode 100644 MetaBlocking/target/classes/advanced/NPMapperNewFromCompressed$InputData.class create mode 100644 MetaBlocking/target/classes/advanced/NPMapperNewFromCompressed$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/NPMapperNewFromCompressed$Weight.class create mode 100644 MetaBlocking/target/classes/advanced/NPMapperNewFromCompressed.class create mode 100644 MetaBlocking/target/classes/advanced/PCNPDriver.class create mode 100644 MetaBlocking/target/classes/advanced/PNPMapper$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/PNPMapper$Weight.class create mode 100644 MetaBlocking/target/classes/advanced/PNPMapper.class create mode 100644 MetaBlocking/target/classes/advanced/PWNPDriver.class create mode 100644 MetaBlocking/target/classes/advanced/WEPDriver.class create mode 100644 MetaBlocking/target/classes/advanced/WEPMapper$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/WEPMapper$Weight.class create mode 100644 MetaBlocking/target/classes/advanced/WEPMapper.class create mode 100644 MetaBlocking/target/classes/advanced/WEPMapperOnly$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/WEPMapperOnly.class create mode 100644 MetaBlocking/target/classes/advanced/WEPReducer.class create mode 100644 MetaBlocking/target/classes/advanced/WNPDriver.class create mode 100644 MetaBlocking/target/classes/advanced/WNPEJSDriver.class create mode 100644 MetaBlocking/target/classes/advanced/WNPMapper$OutputData.class create mode 100644 MetaBlocking/target/classes/advanced/WNPMapper$Weight.class create mode 100644 MetaBlocking/target/classes/advanced/WNPMapper.class create mode 100644 MetaBlocking/target/classes/advanced/WNPReducer.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/ARCS.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/ARCSDriver.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/ARCSMapper$InputData.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/ARCSMapper.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/ARCSReducer$OutputData.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/ARCSReducer.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/ARCSReducerDirty$OutputData.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/ARCSReducerDirty.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/AllBlockComparisonsDriver.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/AllBlockComparisonsDriverBalanced.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/AllBlockComparisonsMapper$InputData.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/AllBlockComparisonsMapper.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/AllBlockComparisonsParitioner.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/AllBlockComparisonsReducer$OutputData.class create mode 100644 
MetaBlocking/target/classes/blockingGraphBuilding/AllBlockComparisonsReducer.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/AllBlockComparisonsReducerDirty$OutputData.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/AllBlockComparisonsReducerDirty.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/BlockingGraph.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/BlockingGraphARCS.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/BlockingGraphEJS.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/CBS.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/ECBS.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/EJS.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/EJSDriver.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/EJSMapper.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/EJSReducer.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/JS.class create mode 100644 MetaBlocking/target/classes/blockingGraphBuilding/SumCombiner.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/AverageWeightCombiner.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/AverageWeightDriver.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/AverageWeightMapper.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/AverageWeightReducer.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/CEPCombiner.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/CEPCountingDriver.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/CEPFinalDriver.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/CEPFinalMapper.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/CEPFinalMapperOnly.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/CEPFinalReducer.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/CEPMapper.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/CEPReducer.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/CEPTotalOrder.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/CNP$Output.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/CNP.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/CNPDriver.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/NPMapper.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/PCNPDriver.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/PNPMapper.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/PWNPDriver.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/Reciprocal.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/ReciprocalDriver.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/WEP.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/WEPDriver.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/WNP$Output.class create mode 100644 MetaBlocking/target/classes/blockingGraphPruning/WNP.class create mode 100644 
MetaBlocking/target/classes/blockingGraphPruning/WNPDriver.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriver.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverAverageWeight.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverAverageWeightARCS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverAverageWeightEJS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverCEP1.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverCEP1ARCS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverCEP1EJS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverCEP2.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverCEP3.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverCEP3ARCS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverCEP3EJS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverCNP.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverCNPARCS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverCNPEJS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverWEP.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverWEPARCS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverWEPEJS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverWNP.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverWNPARCS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedDriverWNPEJS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedIndexDriver.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedIndexMapper.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedIndexReducer$OutputData.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedIndexReducer.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedIndexReducerMemory$OutputData.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedIndexReducerMemory.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedMapper.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedMapperFromCompressed.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedMapperFromCompressedNP.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedMapperFromCompressedNPARCSClean.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedMapperFromCompressedNPClean.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducer.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerAverageWeight$Weight.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerAverageWeight.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerAverageWeightARCSClean$Weight.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerAverageWeightARCSClean.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerAverageWeightARCSDirty$Weight.class create mode 100644 
MetaBlocking/target/classes/entityBased/EntityBasedReducerAverageWeightARCSDirty.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerAverageWeightEJS$Weight.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerAverageWeightEJS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCEP.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCEPARCSClean.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCEPARCSDirty.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCEPEJS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCEPFinal$Output.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCEPFinal.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCEPFinalARCSClean$Output.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCEPFinalARCSClean.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCEPFinalARCSDirty$Output.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCEPFinalARCSDirty.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCEPFinalEJS$Output.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCEPFinalEJS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCNP$Output.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCNP.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCNPARCSClean$Output.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCNPARCSClean.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCNPARCSDirty$Output.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCNPARCSDirty.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCNPEJS$Output.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerCNPEJS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerWEP$Output.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerWEP.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerWEPARCSClean$Output.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerWEPARCSClean.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerWEPARCSDirty$Output.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerWEPARCSDirty.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerWEPEJS$Output.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerWEPEJS.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerWNP$Output.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerWNP.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerWNPARCSClean$Output.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerWNPARCSClean.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerWNPARCSDirty$Output.class create mode 100644 
MetaBlocking/target/classes/entityBased/EntityBasedReducerWNPARCSDirty.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerWNPEJS$Output.class create mode 100644 MetaBlocking/target/classes/entityBased/EntityBasedReducerWNPEJS.class create mode 100644 MetaBlocking/target/classes/entityBased/NodeDegreeDriver.class create mode 100644 MetaBlocking/target/classes/entityBased/NodeDegreeReducer$Output.class create mode 100644 MetaBlocking/target/classes/entityBased/NodeDegreeReducer.class create mode 100644 MetaBlocking/target/classes/hadoopUtils/DescendingDoubleComparator.class create mode 100644 MetaBlocking/target/classes/hadoopUtils/DescendingVIntComparator.class create mode 100644 MetaBlocking/target/classes/hadoopUtils/InverseReducer.class create mode 100644 MetaBlocking/target/classes/hadoopUtils/MBTools$WeightingScheme.class create mode 100644 MetaBlocking/target/classes/hadoopUtils/MBTools.class create mode 100644 MetaBlocking/target/classes/hadoopUtils/MapSortByValue$1.class create mode 100644 MetaBlocking/target/classes/hadoopUtils/MapSortByValue.class create mode 100644 MetaBlocking/target/classes/hadoopUtils/Partition.class create mode 100644 MetaBlocking/target/classes/hadoopUtils/PartitionComparator.class create mode 100644 MetaBlocking/target/classes/hadoopUtils/ReadHadoopStats.class create mode 100644 MetaBlocking/target/classes/hadoopUtils/RelativePositionCompression.class create mode 100644 MetaBlocking/target/classes/hadoopUtils/ValueComparator.class create mode 100644 MetaBlocking/target/classes/preprocessing/AfterFilteringBlockSizeByteCounterMapper$InputData.class create mode 100644 MetaBlocking/target/classes/preprocessing/AfterFilteringBlockSizeByteCounterMapper.class create mode 100644 MetaBlocking/target/classes/preprocessing/AfterFilteringBlockSizeByteCounterReducer$OUTPUT_COUNTER.class create mode 100644 MetaBlocking/target/classes/preprocessing/AfterFilteringBlockSizeByteCounterReducer.class create mode 100644 MetaBlocking/target/classes/preprocessing/AfterFilteringBlockSizeByteCounterReducerDirty$OUTPUT_COUNTER.class create mode 100644 MetaBlocking/target/classes/preprocessing/AfterFilteringBlockSizeByteCounterReducerDirty.class create mode 100644 MetaBlocking/target/classes/preprocessing/AfterFilteringBlockSizeCounterMapper$InputData.class create mode 100644 MetaBlocking/target/classes/preprocessing/AfterFilteringBlockSizeCounterMapper.class create mode 100644 MetaBlocking/target/classes/preprocessing/AfterFilteringBlockSizeCounterReducer.class create mode 100644 MetaBlocking/target/classes/preprocessing/AfterFilteringByteCounter.class create mode 100644 MetaBlocking/target/classes/preprocessing/AfterFilteringCounter.class create mode 100644 MetaBlocking/target/classes/preprocessing/BasicEntityPruningCombiner.class create mode 100644 MetaBlocking/target/classes/preprocessing/BasicEntityPruningDriver.class create mode 100644 MetaBlocking/target/classes/preprocessing/BasicEntityPruningMapper$InputData.class create mode 100644 MetaBlocking/target/classes/preprocessing/BasicEntityPruningMapper.class create mode 100644 MetaBlocking/target/classes/preprocessing/BasicEntityPruningReducer.class create mode 100644 MetaBlocking/target/classes/preprocessing/BasicEntityPruningReducerNew.class create mode 100644 MetaBlocking/target/classes/preprocessing/BlockSizeCounterDriver.class create mode 100644 MetaBlocking/target/classes/preprocessing/BlockSizeCounterMapper$InputComparisons.class create mode 100644 
MetaBlocking/target/classes/preprocessing/BlockSizeCounterMapper.class create mode 100644 MetaBlocking/target/classes/preprocessing/BlocksFromEntityIndexDriver.class create mode 100644 MetaBlocking/target/classes/preprocessing/BlocksFromEntityIndexDriverBalancedFixedPartitions.class create mode 100644 MetaBlocking/target/classes/preprocessing/BlocksFromEntityIndexDriverMaxBlock.class create mode 100644 MetaBlocking/target/classes/preprocessing/BlocksFromEntityIndexMapper.class create mode 100644 MetaBlocking/target/classes/preprocessing/BlocksFromEntityIndexParitioner.class create mode 100644 MetaBlocking/target/classes/preprocessing/BlocksFromEntityIndexReducer$OutputData.class create mode 100644 MetaBlocking/target/classes/preprocessing/BlocksFromEntityIndexReducer.class create mode 100644 MetaBlocking/target/classes/preprocessing/BlocksPerEntity.class create mode 100644 MetaBlocking/target/classes/preprocessing/BlocksPerEntityMapper.class create mode 100644 MetaBlocking/target/classes/preprocessing/EJSDriver.class create mode 100644 MetaBlocking/target/classes/preprocessing/EJSFinalDriver.class create mode 100644 MetaBlocking/target/classes/preprocessing/EJSMapper$OutputData.class create mode 100644 MetaBlocking/target/classes/preprocessing/EJSMapper.class create mode 100644 MetaBlocking/target/classes/preprocessing/EJSReducer.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityIdsToIntDriver.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityIdsToIntMapper$InputData.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityIdsToIntMapper.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityIndexDriver.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityIndexDriverARCS.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityIndexMapper.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityIndexMapperARCS.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityIndexReducer$OutputData.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityIndexReducer.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityIndexReducerARCS$OutputData.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityIndexReducerARCS.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityIndexReducerNoFiltering$OutputData.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityIndexReducerNoFiltering.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityPruningCombiner.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityPruningDirtyFinalMapper.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityPruningDriver.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityPruningFinalDriver.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityPruningFinalMapper.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityPruningFinalReducer.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityPruningMapper$InputData.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityPruningMapper.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityPruningMapperNew$InputData.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityPruningMapperNew.class create mode 100644 
MetaBlocking/target/classes/preprocessing/EntityPruningReducer.class create mode 100644 MetaBlocking/target/classes/preprocessing/EntityPruningReducerNew.class create mode 100644 MetaBlocking/target/classes/preprocessing/ExtendedInputDriver.class create mode 100644 MetaBlocking/target/classes/preprocessing/ExtendedInputMapper.class create mode 100644 MetaBlocking/target/classes/preprocessing/ExtendedInputMapperARCS.class create mode 100644 MetaBlocking/target/classes/preprocessing/ExtendedInputReducer$OutputData.class create mode 100644 MetaBlocking/target/classes/preprocessing/ExtendedInputReducer.class create mode 100644 MetaBlocking/target/classes/preprocessing/ExtendedInputReducerNew$OutputData.class create mode 100644 MetaBlocking/target/classes/preprocessing/ExtendedInputReducerNew.class create mode 100644 MetaBlocking/target/classes/preprocessing/TextToSequence.class create mode 100644 MetaBlocking/target/classes/preprocessing/TextToSequenceMapper.class create mode 100644 MetaBlocking/target/classes/preprocessing/TextToSequenceMapperArrayWritable.class create mode 100644 MetaBlocking/target/classes/preprocessing/VIntArrayWritable.class create mode 100644 MetaBlocking/target/maven-archiver/pom.properties create mode 100644 MetaBlocking/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst create mode 100644 MetaBlocking/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst create mode 100644 MetaBlocking/target/test-classes/.netbeans_automatic_build diff --git a/MetaBlocking/nbactions.xml b/MetaBlocking/nbactions.xml new file mode 100644 index 0000000..732a887 --- /dev/null +++ b/MetaBlocking/nbactions.xml @@ -0,0 +1,46 @@ + + + + run + + jar + + + process-classes + org.codehaus.mojo:exec-maven-plugin:1.2.1:exec + + + -classpath %classpath com.metablocking.preprocessing.BlocksFromEntityIndexDriverBalanced + java + + + + debug + + jar + + + process-classes + org.codehaus.mojo:exec-maven-plugin:1.2.1:exec + + + -Xdebug -Xrunjdwp:transport=dt_socket,server=n,address=${jpda.address} -classpath %classpath com.metablocking.preprocessing.BlocksFromEntityIndexDriverBalanced + java + true + + + + profile + + jar + + + process-classes + org.codehaus.mojo:exec-maven-plugin:1.2.1:exec + + + -classpath %classpath com.metablocking.preprocessing.BlocksFromEntityIndexDriverBalanced + java + + + diff --git a/MetaBlocking/pom.xml b/MetaBlocking/pom.xml new file mode 100644 index 0000000..e46db76 --- /dev/null +++ b/MetaBlocking/pom.xml @@ -0,0 +1,60 @@ + + + 4.0.0 + com.metablocking + MetaBlocking + 1.0 + jar + + UTF-8 + 1.7 + 1.7 + + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.0.2 + + + + true + preprocessing.AfterFilteringByteCounter + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.5.1 + + 1.7 + 1.7 + + + blockingGraphBuilding/AllBlockComparisonsDriverBalancedAdvanced.java + + + + + + + + + + org.apache.hadoop + hadoop-core + 1.2.0 + + + blockingFramework + meta_blocking + 1.0 + compile + + + \ No newline at end of file diff --git a/MetaBlocking/src/main/java/advanced/AverageWeightCombiner.java b/MetaBlocking/src/main/java/advanced/AverageWeightCombiner.java new file mode 100644 index 0000000..3aeabc9 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/AverageWeightCombiner.java @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.hadoop.io.ByteWritable; +import org.apache.hadoop.io.DoubleWritable; 
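+// Combiner note (descriptive comment only): this class pre-sums the DoubleWritable weights of each edge key on the map side,
+// e.g. weights 0.2 and 0.3 emitted for the same "i,j" key are forwarded as a single partial sum 0.5.
+// The drivers that keep it commented out instead aggregate weights through the WEIGHT_COUNTER counter (scaled by 1000)
+// and divide by MAP_OUTPUT_RECORDS to obtain the average edge weight.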
+import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + + +public class AverageWeightCombiner extends MapReduceBase implements Reducer { + + /** + * identity mapper - just keep a counter to sum up weights + * @param key i,j entity ids + * @param value wij the weight of this edge + * @param output identical to input (identity mapper) + */ + public void reduce(ByteWritable key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + double totalWeight = 0; + while (values.hasNext()) { + totalWeight += values.next().get(); + } + output.collect(key, new DoubleWritable(totalWeight)); + } + +} diff --git a/MetaBlocking/src/main/java/advanced/AverageWeightDriver.java b/MetaBlocking/src/main/java/advanced/AverageWeightDriver.java new file mode 100644 index 0000000..8115236 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/AverageWeightDriver.java @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +public class AverageWeightDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(advanced.AverageWeightDriver.class); + + conf.setJobName("Average Edge Weight using Extended Input"); //used for WEP + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + conf.set("weightingScheme", args[0]); //one of: CBS, ECBS, JS, EJS, ARCS + FileInputFormat.setInputPaths(conf, new Path(args[1])); //Blocking Graph + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //All unique comparisons with their weight + + conf.setMapperClass(advanced.AverageWeightMapperNewFromCompressed.class); + //conf.setCombinerClass(advanced.AverageWeightCombiner.class); + //conf.setReducerClass(advanced.AverageWeightReducer.class); + + conf.setNumReduceTasks(0); + + BufferedReader br2 = null, br3 = null; + try{ + Path cleanPath=new Path("/user/hduser/numBlocksClean.txt"); + Path dirtyPath=new Path("/user/hduser/numBlocksDirty.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br2=new BufferedReader(new InputStreamReader(fs.open(cleanPath))); + Integer cleanBlocks = Integer.parseInt(br2.readLine()); + conf.setInt("cleanBlocks", cleanBlocks); + br3=new BufferedReader(new InputStreamReader(fs.open(dirtyPath))); + Integer dirtyBlocks =
Integer.parseInt(br3.readLine()); + conf.setInt("dirtyBlocks", dirtyBlocks); + + if (args[0].equals("EJS")) { + Path pt2= new Path("/user/hduser/validComparisons.txt"); + br2=new BufferedReader(new InputStreamReader(fs.open(pt2))); + String validComparisons = br2.readLine(); + conf.set("validComparisons", validComparisons); + } + + }catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br2.close();br3.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + + + +// conf.setCompressMapOutput(true); + conf.set("mapred.max.tracker.failures", "100"); //before it gets black-listed + conf.set("mapred.job.tracker.handler.count", "40"); + conf.setInt("mapred.task.timeout", 10000000); //before the non-reporting task fails + + + + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + //the following is used only for CNP,CEPTotalOrder but does not create any overhead (keep it always) + if (job == null) { + System.err.println("No job found"); + return; + } + + try { + Counters counters = job.getCounters(); + double totalWeight = counters.findCounter(advanced.AverageWeightMapperNewFromCompressed.Weight.WEIGHT_COUNTER).getCounter() / 1000.0; + long comparisons = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", + "MAP_OUTPUT_RECORDS").getCounter(); + Double averageWeight = totalWeight / comparisons; + Path pt=new Path("/user/hduser/averageWeight.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedWriter br=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))); + br.write(averageWeight.toString()); + br.close(); + } catch (IllegalArgumentException | IOException e) { + System.err.println(e.toString()); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/advanced/AverageWeightEJSDriver.java b/MetaBlocking/src/main/java/advanced/AverageWeightEJSDriver.java new file mode 100644 index 0000000..9f28069 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/AverageWeightEJSDriver.java @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +public class AverageWeightEJSDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(advanced.AverageWeightEJSDriver.class); + + conf.setJobName("Average Edge Weight using Extended Input"); //used for WEP + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + 
SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //EJSFinal + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //All unique comparisons with their weight + + conf.setMapperClass(advanced.AverageWeightEJSMapper.class); + //conf.setCombinerClass(advanced.AverageWeightCombiner.class); + //conf.setReducerClass(advanced.AverageWeightReducer.class); + + conf.setNumReduceTasks(0); + + BufferedReader br = null; + try{ + Path pt= new Path("/user/hduser/validComparisons.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + String validComparisons = br.readLine(); + conf.set("validComparisons", validComparisons); + }catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + //the following is used only for CNP,CEPTotalOrder but does not create any overhead (keep it always) + if (job == null) { + System.err.println("No job found"); + return; + } + + try { + Counters counters = job.getCounters(); + double totalWeight = counters.findCounter(advanced.AverageWeightEJSMapper.Weight.WEIGHT_COUNTER).getCounter() / 1000.0; + long comparisons = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", + "MAP_OUTPUT_RECORDS").getCounter(); + Double averageWeight = (double) totalWeight / comparisons; + Path pt=new Path("/user/hduser/averageWeight.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedWriter bw=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))); + bw.write(averageWeight.toString()); + bw.close(); + } catch (IllegalArgumentException | IOException e) { + System.err.println(e.toString()); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/advanced/AverageWeightEJSMapper.java b/MetaBlocking/src/main/java/advanced/AverageWeightEJSMapper.java new file mode 100644 index 0000000..9d354ae --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/AverageWeightEJSMapper.java @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class AverageWeightEJSMapper extends MapReduceBase implements Mapper { + + public enum Weight {WEIGHT_COUNTER}; + public enum OutputData {PURGED_BLOCKS}; + + private Long validComparisons; + private Text keyToEmit = new Text(); + private DoubleWritable valueToEmit = new DoubleWritable(); + + + public void configure (JobConf job) { + validComparisons = Long.parseLong(job.get("validComparisons")); + } + + + /** + * input: a blocking collection + * @param key block id + * @param value arrays of entity ids in this block (first element), along with the block ids (sorted) that contain them (remaining elements) + * e.g. 
[1,7,8,9],[3,1,8,10] means that in this block belong the entities 1 and 3 and entity 1 is placed in blocks 7,8,9 (sorted) and + * entity 3 is placed in blocks 1,8,10 + * @param output key: entity id (each of the input values). value: entity ids separated by " " (neighbors of output key) + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + Map> entityIndex = new TreeMap<>(); //key is entity id, value is the list of blocks that contain the key + List blocks; + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array with the first element the entity and the rest elements its blocks + for (String tmpEntityIndex : entityIndices) { + if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} + tmpEntityIndex = tmpEntityIndex.substring(1); //to remove the initial '[' + String[] idsArray = tmpEntityIndex.split(", "); + int entityId = Integer.parseInt(idsArray[0]); + blocks = new ArrayList<>(idsArray.length-1); //maybe initial capacity is not needed + for (int i=1; i < idsArray.length; ++i) { + blocks.add(Integer.parseInt(idsArray[i])); + } + entityIndex.put(entityId, blocks); + } + + //dirty ER + List entities = new ArrayList<>(entityIndex.keySet()); + if (entities.size() < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + + + //clean-clean ER + /*List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + for (int entity : entityIndex.keySet()) { + if (entity < 0) { + D2entities.add(entity); + } else { + D1entities.add(entity); + } + } + if (D1entities.isEmpty() || D2entities.isEmpty()) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + + + //clean-clean ER + /*int blockId = key.get(); + List allValuesE1; + List allValuesE2; + List blockse1; + List blockse2; + int counter = 0; + int D1size = D1entities.size(); + + for (int e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1size); + allValuesE1 = entityIndex.get(e1); //the last value is the cardinality of e1 + blockse1 = allValuesE1.subList(0, allValuesE1.size()-1); //the last value is the cardinality of e1 + for (int e2 : D2entities) { + allValuesE2 = entityIndex.get(e2); //the last value is the cardinality of e2 + blockse2 = allValuesE2.subList(0, allValuesE2.size()-1); //the last value is the cardinality of e2 + + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(allValuesE1, allValuesE2, blockId, "EJS", 0, validComparisons); + reporter.incrCounter(Weight.WEIGHT_COUNTER, new Double(weight*1000).longValue()); + keyToEmit.set(e1+","+e2); + valueToEmit.set(weight); + output.collect(keyToEmit, valueToEmit); + } + } + }*/ + + //dirty ER + int blockId = key.get(); + List allValuesE1; + List allValuesE2; + List blockse1; + List blockse2; + int counter = 0; + Integer []entitiesArray = new Integer[entities.size()]; + entitiesArray = entities.toArray(entitiesArray); + int blockSize = entitiesArray.length; + + for (int i = 0; i < blockSize-1; ++i) { + int e1 = entitiesArray[i]; + reporter.setStatus(++counter+"/"+blockSize); + allValuesE1 = entityIndex.get(e1); //the last value is the cardinality of e1 + blockse1 = allValuesE1.subList(0, allValuesE1.size()-1); //the last value is the cardinality of e1 + for (int j = i+1; j < blockSize; ++j) { + int e2 = entitiesArray[j]; + allValuesE2 = entityIndex.get(e2); //the last value is the cardinality of e2 + blockse2 = allValuesE2.subList(0, 
allValuesE2.size()-1); //the last value is the cardinality of e2 + + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(allValuesE1, allValuesE2, blockId, "EJS", 0, validComparisons); + reporter.incrCounter(Weight.WEIGHT_COUNTER, new Double(weight*1000).longValue()); + keyToEmit.set(e1+","+e2); + valueToEmit.set(weight); + output.collect(keyToEmit, valueToEmit); + } + } + } + + } + + +} diff --git a/MetaBlocking/src/main/java/advanced/AverageWeightMapper.java b/MetaBlocking/src/main/java/advanced/AverageWeightMapper.java new file mode 100644 index 0000000..5e12df9 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/AverageWeightMapper.java @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class AverageWeightMapper extends MapReduceBase implements Mapper { + + public enum Weight {WEIGHT_COUNTER}; + public enum OutputData {PURGED_BLOCKS, COMPARISONS}; + + private String weightingScheme; + private Text keyToEmit = new Text(); + private DoubleWritable valueToEmit = new DoubleWritable(); + + private int cleanBlocks; + private int dirtyBlocks; + + public void configure (JobConf job) { + weightingScheme = job.get("weightingScheme"); //one of ARCS,CBS,ECBS,JS,EJS + cleanBlocks = job.getInt("cleanBlocks", 0); + dirtyBlocks = job.getInt("dirtyBlocks", 0); + } + + + /** + * input: a blocking collection + * @param key block id + * @param value arrays of entity ids in this block (first element), along with the block ids (sorted) that contain them (remaining elements) + * e.g. [1,7,8,9],[3,1,8,10] means that in this block belong the entities 1 and 3 and entity 1 is placed in blocks 7,8,9 (sorted) and + * entity 3 is placed in blocks 1,8,10 + * @param output key: entity id (each of the input values). 
value: entity ids separated by " " (neighbors of output key) + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + Map> entityIndex = new TreeMap<>(); //key is entity id, value is the list of blocks that contain the key + List blocks; + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array with the first element the entity and the rest elements its blocks + for (String tmpEntityIndex : entityIndices) { + if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} + tmpEntityIndex = tmpEntityIndex.substring(1); //to remove the initial '[' + String[] idsArray = tmpEntityIndex.split(", "); + int entityId = Integer.parseInt(idsArray[0]); + blocks = new ArrayList<>(idsArray.length-1); //maybe initial capacity is not needed + for (int i=1; i < idsArray.length; ++i) { + blocks.add(Integer.parseInt(idsArray[i])); + } + entityIndex.put(entityId, blocks); + } + + //dirty ER + List entities = new ArrayList<>(entityIndex.keySet()); + if (entities.size() < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + + + //clean-clean ER + /*List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + for (int entity : entityIndex.keySet()) { + if (entity < 0) { + D2entities.add(entity); + } else { + D1entities.add(entity); + } + } + if (D1entities.isEmpty() || D2entities.isEmpty()) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + + + //clean-clean ER + /*int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + int D1size = D1entities.size(); + + for (int e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1size); + blockse1 = entityIndex.get(e1); + for (int e2 : D2entities) { + blockse2 = entityIndex.get(e2); + + reporter.incrCounter(OutputData.COMPARISONS, 1); + + if (!MBTools.isRepeated(blockse1, blockse2, blockId, weightingScheme)) { + Double weight = MBTools.getWeight(blockse1, blockse2, blockId, weightingScheme, cleanBlocks, 0); + reporter.incrCounter(Weight.WEIGHT_COUNTER, new Double(weight*1000).longValue()); + keyToEmit.set(e1+","+e2); + valueToEmit.set(weight); + output.collect(keyToEmit, valueToEmit); + } + } + }*/ + + //dirty ER + int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + Integer []entitiesArray = new Integer[entities.size()]; + entitiesArray = entities.toArray(entitiesArray); + int blockSize = entitiesArray.length; + + for (int i = 0; i < blockSize-1; ++i) { + int e1 = entitiesArray[i]; + reporter.setStatus(++counter+"/"+blockSize); + blockse1 = entityIndex.get(e1); + for (int j = i+1; j < blockSize; ++j) { + + reporter.incrCounter(OutputData.COMPARISONS, 1); + + int e2 = entitiesArray[j]; + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(blockse1, blockse2, blockId, weightingScheme, dirtyBlocks); + reporter.incrCounter(Weight.WEIGHT_COUNTER, new Double(weight*1000).longValue()); + keyToEmit.set(e1+","+e2); + valueToEmit.set(weight); + output.collect(keyToEmit, valueToEmit); + } + } + } + } + +} + + diff --git a/MetaBlocking/src/main/java/advanced/AverageWeightMapperNewFromCompressed.java b/MetaBlocking/src/main/java/advanced/AverageWeightMapperNewFromCompressed.java new file mode 100644 index 0000000..61b82de --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/AverageWeightMapperNewFromCompressed.java @@ -0,0 +1,129 @@ +/* + * 
Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; + +import java.io.IOException; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class AverageWeightMapperNewFromCompressed extends MapReduceBase implements Mapper { + + public enum Weight {WEIGHT_COUNTER}; + public enum OutputData {PURGED_BLOCKS, COMPARISONS}; + + private String weightingScheme; + private Text keyToEmit = new Text(); + private DoubleWritable valueToEmit = new DoubleWritable(); + + private int cleanBlocks; + private int dirtyBlocks; + private long validComparisons; + + public void configure (JobConf job) { + weightingScheme = job.get("weightingScheme"); //one of ARCS,CBS,ECBS,JS,EJS + cleanBlocks = job.getInt("cleanBlocks", 0); + dirtyBlocks = job.getInt("dirtyBlocks", 0); + validComparisons = Long.parseLong(job.get("validComparisons","0")); + } + + + /** + * input: a blocking collection + * @param key block id + * @param value arrays of entity ids in this block (first element), along with the block ids (sorted) that contain them (remaining elements) + * e.g. [1,7,8,9],[3,1,8,10] means that this block contains entities 1 and 3, where entity 1 is placed in blocks 7,8,9 (sorted) and + * entity 3 is placed in blocks 1,8,10 + * @param output key: the two compared entity ids "e1,e2". value: the weight of this comparison (edge) + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array with the first element the entity and the rest elements its blocks + + int counter = 0; + int noOfEntities = entityIndices.length; + //dirty ER +// if (noOfEntities < 2) { //TODO:is it correct ?????? +// reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); +// return; +// } + + boolean containsPositive = false; //clean-clean (in case this check is NOT made in ExtendedInput Reducer) + boolean containsNegative = false; //clean-clean (in case this check is NOT made in ExtendedInput Reducer) + + int[] entityIds = new int[noOfEntities]; //the ids of entities contained in this block + int[][] entityBlocks = new int[noOfEntities][]; //the blocks of each entity + for (String tmpEntityIndex : entityIndices) { + //if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} can we remove this check?
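+ // Illustrative walk-through (assumed serialization, with no separator between the bracketed groups):
+ // for a value of "[1, 7, 8, 9][3, 1, 8, 10]", split("]") yields "[1, 7, 8, 9" and "[3, 1, 8, 10";
+ // the first token of each group fills entityIds (1 and 3, after stripping the leading '[')
+ // and the remaining tokens fill entityBlocks ({7,8,9} and {1,8,10}).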
+ String[] idsArray = tmpEntityIndex.split(", "); + entityIds[counter] = Integer.parseInt(idsArray[0].substring(1)); //first is entity id + + if (entityIds[counter] >= 0) containsPositive = true; else containsNegative = true; //clean-clean (in case this check is NOT made in ExtendedInput Reducer) + + int noOfBlocks = idsArray.length-1; + entityBlocks[counter] = new int[noOfBlocks]; + for (int i=0; i < noOfBlocks; ++i) { + entityBlocks[counter][i] = Integer.parseInt(idsArray[i+1]); //the remaining tokens are the blocks of this entity + } + counter++; + } + + + //dirty ER + /*int blockId = key.get(); + for (int i = 0; i < noOfEntities-1; ++i) { + int e1 = entityIds[i]; + reporter.setStatus(++counter+"/"+noOfEntities); + for (int j = i+1; j < noOfEntities; ++j) { + reporter.incrCounter(OutputData.COMPARISONS, 1); + double weight = MBTools.getWeight(blockId, entityBlocks[i], entityBlocks[j], weightingScheme, dirtyBlocks, validComparisons); + if (weight > 0) { + reporter.incrCounter(Weight.WEIGHT_COUNTER, new Double(weight*1000).longValue()); + keyToEmit.set(e1+","+entityIds[j]); + valueToEmit.set(weight); + output.collect(keyToEmit, valueToEmit); + } + } + }*/ + + //clean-clean ER + if (!(containsNegative && containsPositive)) { //unless this check is made in ExtendedInput Reducer + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; //no comparisons from this block + } + + int blockId = key.get(); + for (int i = 0; i < noOfEntities-1; ++i) { + int e1 = entityIds[i]; + reporter.setStatus(++counter+"/"+noOfEntities); + for (int j = i+1; j < noOfEntities; ++j) { + int e2 = entityIds[j]; + if ((e1 ^ e2) >> 31 == 0) { //equal sign bit? (30% faster than if ((e1 > 0 && e2 > 0) || (e1 < 0 && e2 < 0)) ) + continue; + } + reporter.incrCounter(OutputData.COMPARISONS, 1); + double weight = MBTools.getWeight(blockId, entityBlocks[i], entityBlocks[j], weightingScheme, cleanBlocks, validComparisons); + if (weight > 0) { + reporter.incrCounter(Weight.WEIGHT_COUNTER, new Double(weight*1000).longValue()); + keyToEmit.set(e1+","+entityIds[j]); + valueToEmit.set(weight); + output.collect(keyToEmit, valueToEmit); + } + } + } + + } + +} + + \ No newline at end of file diff --git a/MetaBlocking/src/main/java/advanced/AverageWeightReducer.java b/MetaBlocking/src/main/java/advanced/AverageWeightReducer.java new file mode 100644 index 0000000..6449e77 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/AverageWeightReducer.java @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.hadoop.io.ByteWritable; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; + +public class AverageWeightReducer extends MapReduceBase implements Reducer { + + long totalPairs; + public void configure(JobConf conf) { + try { + JobClient client = new JobClient(conf); + RunningJob parentJob = client.getJob(JobID.forName(conf.get("mapred.job.id"))); + totalPairs = parentJob.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", + "MAP_OUTPUT_RECORDS").getCounter(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + /** + * identity
mapper - just keep a counter to sum up weights + * @param key i,j entity ids + * @param value wij the weight of this edge + * @param output identical to input (identity mapper) + */ + public void reduce(ByteWritable key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + double totalWeight = 0; + while (values.hasNext()) { + totalWeight += values.next().get(); + } + output.collect(new DoubleWritable(totalWeight/totalPairs), NullWritable.get()); + } + +} diff --git a/MetaBlocking/src/main/java/advanced/CEPCountingDriver.java b/MetaBlocking/src/main/java/advanced/CEPCountingDriver.java new file mode 100644 index 0000000..4053a94 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CEPCountingDriver.java @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; + + +public class CEPCountingDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(advanced.CEPCountingDriver.class); + + conf.setJobName("CEP Counting using Extended Input"); //used for CEP + + conf.setMapOutputKeyClass(DoubleWritable.class); + conf.setMapOutputValueClass(VIntWritable.class); + + conf.setOutputKeyClass(DoubleWritable.class); + conf.setOutputValueClass(NullWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(TextOutputFormat.class); + + conf.setOutputKeyComparatorClass(hadoopUtils.DescendingDoubleComparator.class); //sort doubles in descending order + + if (!args[0].equals("ARCS") && !args[0].equals("CBS") && !args[0].equals("JS") + && !args[0].equals("EJS") && !args[0].equals("ECBS")) { + System.err.println(args[0] +"is not a valid weighting scheme!"); + System.exit(0); + } + + conf.set("weightingScheme", args[0]); //one of: CBS, ECBS, JS, EJS, ARCS + + FileInputFormat.setInputPaths(conf, new Path(args[1])); //Extended Input + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //minValue and extra (more than k) elements + + conf.setMapperClass(advanced.CEPMapperNew.class); + conf.setCombinerClass(blockingGraphPruning.CEPCombiner.class); + conf.setReducerClass(blockingGraphPruning.CEPReducer.class); + + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setInt("mapred.task.timeout", 10000000); + conf.setNumReduceTasks(1); + + conf.setCompressMapOutput(true); + + BufferedReader br = null, br2 = null, br3 = null; + try { + Path pt=new Path("/user/hduser/CEPk.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + Integer K = Integer.parseInt(br.readLine()); + br.close(); + conf.setInt("K", K); + System.out.println("K="+K); + + Path cleanPath=new Path("/user/hduser/numBlocksClean.txt"); + Path dirtyPath=new Path("/user/hduser/numBlocksDirty.txt"); + br2=new 
BufferedReader(new InputStreamReader(fs.open(cleanPath))); + Integer cleanBlocks = Integer.parseInt(br2.readLine()); + conf.setInt("cleanBlocks", cleanBlocks); + br3=new BufferedReader(new InputStreamReader(fs.open(dirtyPath))); + Integer dirtyBlocks = Integer.parseInt(br3.readLine()); + conf.setInt("dirtyBlocks", dirtyBlocks); + + if (args[0].equals("EJS")) { + Path pt2= new Path("/user/hduser/validComparisons.txt"); + br2=new BufferedReader(new InputStreamReader(fs.open(pt2))); + String validComparisons = br2.readLine(); + conf.set("validComparisons", validComparisons); + } + + } catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close(); br2.close();br3.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/advanced/CEPCountingEJSDriver.java b/MetaBlocking/src/main/java/advanced/CEPCountingEJSDriver.java new file mode 100644 index 0000000..6af70bb --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CEPCountingEJSDriver.java @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; + + +public class CEPCountingEJSDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(advanced.CEPCountingDriver.class); + + conf.setJobName("CEP Counting using Extended Input (EJS)"); //used for CEP + + conf.setMapOutputKeyClass(DoubleWritable.class); + conf.setMapOutputValueClass(VIntWritable.class); + + conf.setOutputKeyClass(DoubleWritable.class); + conf.setOutputValueClass(NullWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(TextOutputFormat.class); + + conf.setOutputKeyComparatorClass(hadoopUtils.DescendingDoubleComparator.class); //sort doubles in descending order + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //Extended Input + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //minValue and extra (more than k) elements + + conf.setMapperClass(advanced.CEPMapperNewEJS.class); + conf.setCombinerClass(blockingGraphPruning.CEPCombiner.class); + conf.setReducerClass(blockingGraphPruning.CEPReducer.class); + + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setInt("mapred.task.timeout", 10000000); + conf.setNumReduceTasks(1); + + conf.setCompressMapOutput(true); + + BufferedReader br = null, br2 = null, br3 = null; + try { + Path pt=new Path("/user/hduser/CEPk.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + Integer K = Integer.parseInt(br.readLine()); + br.close(); + conf.setInt("K", K); + System.out.println("K="+K); + + Path cleanPath=new 
Path("/user/hduser/numBlocksClean.txt"); + Path dirtyPath=new Path("/user/hduser/numBlocksDirty.txt"); + br2=new BufferedReader(new InputStreamReader(fs.open(cleanPath))); + Integer cleanBlocks = Integer.parseInt(br2.readLine()); + conf.setInt("cleanBlocks", cleanBlocks); + br3=new BufferedReader(new InputStreamReader(fs.open(dirtyPath))); + Integer dirtyBlocks = Integer.parseInt(br3.readLine()); + conf.setInt("dirtyBlocks", dirtyBlocks); + + + Path pt2= new Path("/user/hduser/validComparisons.txt"); + br2=new BufferedReader(new InputStreamReader(fs.open(pt2))); + String validComparisons = br2.readLine(); + conf.set("validComparisons", validComparisons); + + } catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close(); br2.close();br3.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/advanced/CEPEJSMapper.java b/MetaBlocking/src/main/java/advanced/CEPEJSMapper.java new file mode 100644 index 0000000..af052e2 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CEPEJSMapper.java @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import advanced.AverageWeightEJSMapper.Weight; + +public class CEPEJSMapper extends MapReduceBase implements Mapper { + + public enum OutputData {PURGED_BLOCKS}; + + private Long validComparisons; + private VIntWritable one = new VIntWritable(1); + private DoubleWritable weightToEmit = new DoubleWritable(); + + + public void configure (JobConf job) { + validComparisons = Long.parseLong(job.get("validComparisons")); + } + + + /** + * input: a blocking collection + * @param key block id + * @param value arrays of entity ids in this block (first element), along with the block ids (sorted) that contain them (remaining elements) + * e.g. 
[1,7,8,9],[3,1,8,10] means that in this block belong the entities 1 and 3 and entity 1 is placed in blocks 7,8,9 (sorted) and + * entity 3 is placed in blocks 1,8,10 + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + Map> entityIndex = new TreeMap<>(); //key is entity id, value is the list of blocks that contain the key + List blocks; + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array with the first element the entity and the rest elements its blocks + for (String tmpEntityIndex : entityIndices) { + if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} + tmpEntityIndex = tmpEntityIndex.substring(1); //to remove the initial '[' + String[] idsArray = tmpEntityIndex.split(", "); + int entityId = Integer.parseInt(idsArray[0]); + blocks = new ArrayList<>(idsArray.length-1); //maybe initial capacity is not needed + for (int i=1; i < idsArray.length; ++i) { + blocks.add(Integer.parseInt(idsArray[i])); + } + entityIndex.put(entityId, blocks); + } + + //dirty ER + /*List entities = new ArrayList<>(entityIndex.keySet()); + if (entities.size() < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + + + //clean-clean ER + List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + for (int entity : entityIndex.keySet()) { + if (entity < 0) { + D2entities.add(entity); + } else { + D1entities.add(entity); + } + } + if (D1entities.isEmpty() || D2entities.isEmpty()) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + DecimalFormat df = new DecimalFormat("#.###"); //format doubles to keep only first 4 decimal points (saves space) + + //clean-clean ER + int blockId = key.get(); + List allValuesE1; + List allValuesE2; + List blockse1; + List blockse2; + int counter = 0; + int D1size = D1entities.size(); + + for (int e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1size); + allValuesE1 = entityIndex.get(e1); //the last value is the cardinality of e1 + blockse1 = allValuesE1.subList(0, allValuesE1.size()-1); //the last value is the cardinality of e1 + for (int e2 : D2entities) { + allValuesE2 = entityIndex.get(e2); //the last value is the cardinality of e2 + blockse2 = allValuesE2.subList(0, allValuesE2.size()-1); //the last value is the cardinality of e2 + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(allValuesE1, allValuesE2, blockId, "EJS", 0, validComparisons); + weightToEmit.set(Double.parseDouble(df.format(weight))); + output.collect(weightToEmit, one); + } + } + } + + //dirty ER + /*int blockId = key.get(); + List allValuesE1; + List allValuesE2; + List blockse1; + List blockse2; + int counter = 0; + Integer []entitiesArray = new Integer[entities.size()]; + entitiesArray = entities.toArray(entitiesArray); + int blockSize = entitiesArray.length; + + for (int i = 0; i < blockSize-1; ++i) { + int e1 = entitiesArray[i]; + reporter.setStatus(++counter+"/"+blockSize); + allValuesE1 = entityIndex.get(e1); //the last value is the cardinality of e1 + blockse1 = allValuesE1.subList(0, allValuesE1.size()-1); //the last value is the cardinality of e1 + for (int j = i+1; j < blockSize; ++j) { + int e2 = entitiesArray[j]; + allValuesE2 = entityIndex.get(e2); //the last value is the cardinality of e2 + blockse2 = allValuesE2.subList(0, allValuesE2.size()-1); //the last value is the cardinality of e2 + + if 
(!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(allValuesE1, allValuesE2, blockId, "EJS", 0, validComparisons); + weightToEmit.set(Double.parseDouble(df.format(weight))); + output.collect(weightToEmit, one); + } + } + }*/ + + } + + +} diff --git a/MetaBlocking/src/main/java/advanced/CEPFinalDriver.java b/MetaBlocking/src/main/java/advanced/CEPFinalDriver.java new file mode 100644 index 0000000..1344661 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CEPFinalDriver.java @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +public class CEPFinalDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(advanced.CEPFinalDriver.class); + + conf.setJobName("CEP Final From Extended Input"); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + conf.set("weightingScheme", args[0]); //one of: CBS, ECBS, JS, EJS, ARCS + + FileInputFormat.setInputPaths(conf, new Path(args[1])); //Extended Input + FileOutputFormat.setOutputPath(conf, new Path(args[3])); //CEP + + BufferedReader br = null, br2 = null, br3 = null; + try{ + Path pt=new Path(args[2]+"/part-00000"); //CEPCounting From Extended Input + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + String minValue = br.readLine(); + Integer extraElements = ((Double)Double.parseDouble(br.readLine())).intValue(); + conf.set("min", minValue); + conf.setInt("extra", extraElements); + System.out.println("min="+minValue); + System.out.println("extra="+extraElements); + + if (extraElements > 0) { //use a reducer to skip the extra elements + + conf.setMapperClass(advanced.CEPFinalMapperNew.class); + conf.setReducerClass(blockingGraphPruning.CEPFinalReducer.class); + + conf.setNumReduceTasks(56); + + conf.setMapOutputKeyClass(DoubleWritable.class); + conf.setMapOutputValueClass(Text.class); + } else { //don't use a reducer + conf.setMapperClass(advanced.CEPFinalMapperOnlyNew.class); + conf.setNumReduceTasks(0); + } + + Path cleanPath=new Path("/user/hduser/numBlocksClean.txt"); + Path dirtyPath=new Path("/user/hduser/numBlocksDirty.txt"); + br2=new BufferedReader(new InputStreamReader(fs.open(cleanPath))); + Integer cleanBlocks = Integer.parseInt(br2.readLine()); + conf.setInt("cleanBlocks", cleanBlocks); + br3=new BufferedReader(new InputStreamReader(fs.open(dirtyPath))); + Integer dirtyBlocks = Integer.parseInt(br3.readLine()); + conf.setInt("dirtyBlocks", dirtyBlocks); + + if (args[0].equals("EJS")) { + Path pt2= new Path("/user/hduser/validComparisons.txt"); + br2=new 
BufferedReader(new InputStreamReader(fs.open(pt2))); + String validComparisons = br2.readLine(); + conf.set("validComparisons", validComparisons); + } + + + } catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close(); br2.close();br3.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + conf.setInt("mapred.task.timeout", 10000000); //before the non-reporting task fails +// conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/advanced/CEPFinalEJSDriver.java b/MetaBlocking/src/main/java/advanced/CEPFinalEJSDriver.java new file mode 100644 index 0000000..8614789 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CEPFinalEJSDriver.java @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +public class CEPFinalEJSDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(advanced.CEPFinalDriver.class); + + conf.setJobName("CEP Final From Extended Input (EJS)"); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //Extended Input + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //CEP + + BufferedReader br = null, br2 = null, br3 = null; + try{ + Path pt=new Path(args[1]+"/part-00000"); //CEPCounting From Extended Input + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + String minValue = br.readLine(); + Integer extraElements = ((Double)Double.parseDouble(br.readLine())).intValue(); + conf.set("min", minValue); + conf.setInt("extra", extraElements); + System.out.println("min="+minValue); + System.out.println("extra="+extraElements); + + if (extraElements > 0) { //use a reducer to skip the extra elements + + conf.setMapperClass(advanced.CEPFinalMapperNewEJS.class); + conf.setReducerClass(blockingGraphPruning.CEPFinalReducer.class); + + conf.setNumReduceTasks(56); + + conf.setMapOutputKeyClass(DoubleWritable.class); + conf.setMapOutputValueClass(Text.class); + } else { //don't use a reducer + conf.setMapperClass(advanced.CEPFinalMapperOnlyNewEJS.class); + conf.setNumReduceTasks(0); + } + + Path cleanPath=new Path("/user/hduser/numBlocksClean.txt"); + Path dirtyPath=new Path("/user/hduser/numBlocksDirty.txt"); + br2=new BufferedReader(new InputStreamReader(fs.open(cleanPath))); + Integer cleanBlocks = 
Integer.parseInt(br2.readLine()); + conf.setInt("cleanBlocks", cleanBlocks); + br3=new BufferedReader(new InputStreamReader(fs.open(dirtyPath))); + Integer dirtyBlocks = Integer.parseInt(br3.readLine()); + conf.setInt("dirtyBlocks", dirtyBlocks); + + + Path pt2= new Path("/user/hduser/validComparisons.txt"); + br2=new BufferedReader(new InputStreamReader(fs.open(pt2))); + String validComparisons = br2.readLine(); + conf.set("validComparisons", validComparisons); + + + + } catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close(); br2.close();br3.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + conf.setInt("mapred.task.timeout", 10000000); +// conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); //acceptable failures before the whole job fails + conf.set("mapred.reduce.max.attempts", "10"); //before it is not started again + conf.set("mapred.max.tracker.failures", "100"); //before it gets black-listed + conf.set("mapred.job.tracker.handler.count", "40"); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/advanced/CEPFinalEJSMapper.java b/MetaBlocking/src/main/java/advanced/CEPFinalEJSMapper.java new file mode 100644 index 0000000..1ca1608 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CEPFinalEJSMapper.java @@ -0,0 +1,148 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import advanced.AverageWeightEJSMapper.Weight; + +public class CEPFinalEJSMapper extends MapReduceBase implements Mapper { + + public enum OutputData {PURGED_BLOCKS}; + + double minValue; + + private Text comparison = new Text(); + private DoubleWritable weightToEmit = new DoubleWritable(); + private Long validComparisons; + + public void configure (JobConf conf) { + minValue = Double.parseDouble(conf.get("min", "0")); + validComparisons = Long.parseLong(conf.get("validComparisons")); + } + + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + Map> entityIndex = new TreeMap<>(); //key is entity id, value is the list of blocks that contain the key + List blocks; + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array with the first element the entity and the rest elements its blocks + for (String tmpEntityIndex : entityIndices) { + if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} + tmpEntityIndex = tmpEntityIndex.substring(1); //to remove the initial '[' + String[] idsArray = tmpEntityIndex.split(", "); + int entityId = Integer.parseInt(idsArray[0]); + blocks = new ArrayList<>(idsArray.length-1); //maybe initial capacity is not needed + for (int i=1; i < idsArray.length; ++i) { + blocks.add(Integer.parseInt(idsArray[i])); + } + entityIndex.put(entityId, blocks); + } + + 
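+ // Worked example (hypothetical values, in the same encoding the parsing loop above expects):
+ // if this block contains entity 1 with blocks 7,8,9 and cardinality 3, and entity -3 with
+ // blocks 1,8,10 and cardinality 2, entityIndex now holds { -3 -> [1, 8, 10, 2], 1 -> [7, 8, 9, 3] };
+ // as the comments further down note, the last element of each list is the entity's comparison
+ // cardinality (needed for the EJS weight) and the preceding elements are the ids of its blocks.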
//dirty ER + /*List entities = new ArrayList<>(entityIndex.keySet()); + if (entities.size() < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + + + //clean-clean ER + List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + for (int entity : entityIndex.keySet()) { + if (entity < 0) { + D2entities.add(entity); + } else { + D1entities.add(entity); + } + } + if (D1entities.isEmpty() || D2entities.isEmpty()) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + DecimalFormat df = new DecimalFormat("#.###"); //format doubles to keep only first 4 decimal points (saves space) + + + + //clean-clean ER + int blockId = key.get(); + List allValuesE1; + List allValuesE2; + List blockse1; + List blockse2; + int counter = 0; + int D1size = D1entities.size(); + + for (int e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1size); + allValuesE1 = entityIndex.get(e1); //the last value is the cardinality of e1 + blockse1 = allValuesE1.subList(0, allValuesE1.size()-1); //the last value is the cardinality of e1 + for (int e2 : D2entities) { + allValuesE2 = entityIndex.get(e2); //the last value is the cardinality of e2 + blockse2 = allValuesE2.subList(0, allValuesE2.size()-1); //the last value is the cardinality of e2 + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(allValuesE1, allValuesE2, blockId, "EJS", 0, validComparisons); + if (weight >= minValue) { //edge belongs in top k+extraElements + comparison.set(e1+","+e2); + weightToEmit.set(Double.parseDouble(df.format(weight))); + output.collect(weightToEmit, comparison); + } + } + } + } + + //dirty ER + /*int blockId = key.get(); + List allValuesE1; + List allValuesE2; + List blockse1; + List blockse2; + int counter = 0; + Integer []entitiesArray = new Integer[entities.size()]; + entitiesArray = entities.toArray(entitiesArray); + int blockSize = entitiesArray.length; + + for (int i = 0; i < blockSize-1; ++i) { + int e1 = entitiesArray[i]; + reporter.setStatus(++counter+"/"+blockSize); + allValuesE1 = entityIndex.get(e1); //the last value is the cardinality of e1 + blockse1 = allValuesE1.subList(0, allValuesE1.size()-1); //the last value is the cardinality of e1 + for (int j = i+1; j < blockSize; ++j) { + int e2 = entitiesArray[j]; + allValuesE2 = entityIndex.get(e2); //the last value is the cardinality of e2 + blockse2 = allValuesE2.subList(0, allValuesE2.size()-1); //the last value is the cardinality of e2 + + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(allValuesE1, allValuesE2, blockId, "EJS", 0, validComparisons); + if (weight >= minValue) { //edge belongs in top k+extraElements + comparison.set(e1+","+e2); + weightToEmit.set(Double.parseDouble(df.format(weight))); + output.collect(weightToEmit, comparison); + } + } + } + }*/ + + } + + +} diff --git a/MetaBlocking/src/main/java/advanced/CEPFinalEJSMapperOnly.java b/MetaBlocking/src/main/java/advanced/CEPFinalEJSMapperOnly.java new file mode 100644 index 0000000..6e6484f --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CEPFinalEJSMapperOnly.java @@ -0,0 +1,145 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import 
org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class CEPFinalEJSMapperOnly extends MapReduceBase implements Mapper { + + public enum OutputData {PURGED_BLOCKS, OUTPUT_RECORDS}; + + double minValue; + + private Text comparison = new Text(); + private DoubleWritable weightToEmit = new DoubleWritable(); + private Long validComparisons; + + public void configure (JobConf conf) { + minValue = Double.parseDouble(conf.get("min", "0")); + validComparisons = Long.parseLong(conf.get("validComparisons")); + } + + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + Map> entityIndex = new TreeMap<>(); //key is entity id, value is the list of blocks that contain the key + List blocks; + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array with the first element the entity and the rest elements its blocks + for (String tmpEntityIndex : entityIndices) { + if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} + tmpEntityIndex = tmpEntityIndex.substring(1); //to remove the initial '[' + String[] idsArray = tmpEntityIndex.split(", "); + int entityId = Integer.parseInt(idsArray[0]); + blocks = new ArrayList<>(idsArray.length-1); //maybe initial capacity is not needed + for (int i=1; i < idsArray.length; ++i) { + blocks.add(Integer.parseInt(idsArray[i])); + } + entityIndex.put(entityId, blocks); + } + + //dirty ER + /*List entities = new ArrayList<>(entityIndex.keySet()); + if (entities.size() < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + + + //clean-clean ER + List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + for (int entity : entityIndex.keySet()) { + if (entity < 0) { + D2entities.add(entity); + } else { + D1entities.add(entity); + } + } + if (D1entities.isEmpty() || D2entities.isEmpty()) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + + + //clean-clean ER + int blockId = key.get(); + List allValuesE1; + List allValuesE2; + List blockse1; + List blockse2; + int counter = 0; + int D1size = D1entities.size(); + + for (int e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1size); + allValuesE1 = entityIndex.get(e1); //the last value is the cardinality of e1 + blockse1 = allValuesE1.subList(0, allValuesE1.size()-1); //the last value is the cardinality of e1 + for (int e2 : D2entities) { + allValuesE2 = entityIndex.get(e2); //the last value is the cardinality of e2 + blockse2 = allValuesE2.subList(0, allValuesE2.size()-1); //the last value is the cardinality of e2 + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(allValuesE1, allValuesE2, blockId, "EJS", 0, validComparisons); + if (weight >= minValue) { //edge belongs in top k+extraElements + comparison.set(e1+","+e2); + weightToEmit.set(weight); + //output.collect(comparison, weightToEmit); + reporter.incrCounter(OutputData.OUTPUT_RECORDS, 1);//to save space + } + } + } + } + + //dirty ER + /*int blockId = key.get(); + List allValuesE1; + List allValuesE2; + List blockse1; + List blockse2; + int counter = 0; + Integer []entitiesArray = new Integer[entities.size()]; + entitiesArray = entities.toArray(entitiesArray); + int blockSize = entitiesArray.length; + + for (int i = 0; i < blockSize-1; 
++i) { + int e1 = entitiesArray[i]; + reporter.setStatus(++counter+"/"+blockSize); + allValuesE1 = entityIndex.get(e1); //the last value is the cardinality of e1 + blockse1 = allValuesE1.subList(0, allValuesE1.size()-1); //the last value is the cardinality of e1 + for (int j = i+1; j < blockSize; ++j) { + int e2 = entitiesArray[j]; + allValuesE2 = entityIndex.get(e2); //the last value is the cardinality of e2 + blockse2 = allValuesE2.subList(0, allValuesE2.size()-1); //the last value is the cardinality of e2 + + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(allValuesE1, allValuesE2, blockId, "EJS", 0, validComparisons); + if (weight >= minValue) { //edge belongs in top k+extraElements + comparison.set(e1+","+e2); + weightToEmit.set(weight); + //output.collect(comparison, weightToEmit); + reporter.incrCounter(OutputData.OUTPUT_RECORDS, 1);//to save space + } + } + } + }*/ + + } + + +} diff --git a/MetaBlocking/src/main/java/advanced/CEPFinalMapper.java b/MetaBlocking/src/main/java/advanced/CEPFinalMapper.java new file mode 100644 index 0000000..59c2e9b --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CEPFinalMapper.java @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class CEPFinalMapper extends MapReduceBase implements Mapper { + + public enum OutputData {PURGED_BLOCKS}; + + private String weightingScheme; + double minValue; + + private Text comparison = new Text(); + private DoubleWritable weightToEmit = new DoubleWritable(); + private int cleanBlocks; + private int dirtyBlocks; + + public void configure (JobConf conf) { + weightingScheme = conf.get("weightingScheme"); //one of ARCS,CBS,ECBS,JS,EJS + minValue = Double.parseDouble(conf.get("min", "0")); + cleanBlocks = conf.getInt("cleanBlocks", 0); + dirtyBlocks = conf.getInt("dirtyBlocks", 0); + } + + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + Map> entityIndex = new TreeMap<>(); //key is entity id, value is the list of blocks that contain the key + List blocks; + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array with the first element the entity and the rest elements its blocks + for (String tmpEntityIndex : entityIndices) { + if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} + tmpEntityIndex = tmpEntityIndex.substring(1); //to remove the initial '[' + String[] idsArray = tmpEntityIndex.split(", "); + int entityId = Integer.parseInt(idsArray[0]); + blocks = new ArrayList<>(idsArray.length-1); //maybe initial capacity is not needed + for (int i=1; i < idsArray.length; ++i) { + blocks.add(Integer.parseInt(idsArray[i])); + } + entityIndex.put(entityId, blocks); + } + + //dirty ER + List entities = new ArrayList<>(entityIndex.keySet()); + if (entities.size() < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + 
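+ // The active path below is the dirty-ER one: it enumerates every entity pair of this block,
+ // weights a pair only when MBTools.isRepeated(...) indicates the comparison has not already been
+ // handled in another common block of the two entities, and emits it only if the weight reaches
+ // minValue, i.e. the "min" threshold that CEPFinalDriver reads from the CEP counting output
+ // (edges below that threshold are outside the top k + extra edges and are dropped).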
+ + + //clean-clean ER + /*List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + for (int entity : entityIndex.keySet()) { + if (entity < 0) { + D2entities.add(entity); + } else { + D1entities.add(entity); + } + } + if (D1entities.isEmpty() || D2entities.isEmpty()) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + + + //clean-clean ER + /*int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + int D1size = D1entities.size(); + DecimalFormat df = new DecimalFormat("#.####"); //format doubles to keep only first 4 decimal points (saves space) + //used for EJS (many decimal places) + + for (int e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1size); + blockse1 = entityIndex.get(e1); + for (int e2 : D2entities) { + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId, weightingScheme)) { + Double weight = MBTools.getWeight(blockse1, blockse2, blockId, weightingScheme, cleanBlocks, 0); + if (weight >= minValue) { //edge belongs in top k+extraElements + comparison.set(e1+","+e2); + weightToEmit.set(Double.parseDouble(df.format(weight))); + output.collect(weightToEmit, comparison); + } + } + } + }*/ + + //dirty ER + int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + Integer []entitiesArray = new Integer[entities.size()]; + entitiesArray = entities.toArray(entitiesArray); + int blockSize = entitiesArray.length; + + for (int i = 0; i < blockSize-1; ++i) { + int e1 = entitiesArray[i]; + reporter.setStatus(++counter+"/"+blockSize); + blockse1 = entityIndex.get(e1); + for (int j = i+1; j < blockSize; ++j) { + int e2 = entitiesArray[j]; + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(blockse1, blockse2, blockId, weightingScheme, dirtyBlocks); + if (weight >= minValue) { //edge belongs in top k+extraElements + comparison.set(e1+","+e2); + weightToEmit.set(weight); + output.collect(weightToEmit, comparison); + } + } + } + } + + } + + +} diff --git a/MetaBlocking/src/main/java/advanced/CEPFinalMapperNew.java b/MetaBlocking/src/main/java/advanced/CEPFinalMapperNew.java new file mode 100644 index 0000000..051d500 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CEPFinalMapperNew.java @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import advanced.AverageWeightMapperNewFromCompressed.Weight; +import advanced.CEPMapperNew.OutputData; + +public class CEPFinalMapperNew extends MapReduceBase implements Mapper { + + public enum OutputData {PURGED_BLOCKS}; + + private String weightingScheme; + double minValue; + + private Text comparison = new Text(); + private DoubleWritable weightToEmit = new DoubleWritable(); + private int cleanBlocks; + private int dirtyBlocks; + private long validComparisons; + + public void configure (JobConf conf) { + weightingScheme = conf.get("weightingScheme"); //one of ARCS,CBS,ECBS,JS,EJS 
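+ // The remaining properties are filled in by CEPFinalDriver: "min" comes from the output of the
+ // CEP counting job, "cleanBlocks"/"dirtyBlocks" from the numBlocksClean.txt/numBlocksDirty.txt
+ // side files on HDFS, and "validComparisons" from validComparisons.txt (the driver only sets it
+ // for the EJS scheme, hence the "0" default used below).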
+ minValue = Double.parseDouble(conf.get("min", "0")); + cleanBlocks = conf.getInt("cleanBlocks", 0); + dirtyBlocks = conf.getInt("dirtyBlocks", 0); + validComparisons = Long.parseLong(conf.get("validComparisons","0")); + } + + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + String[] entityIndices = value.toString().split("]"); + int noOfEntities = entityIndices.length; + /*//dirty ER + if (noOfEntities < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + boolean containsPositive = false; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) + boolean containsNegative = false; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) +// + + int counter = 0; + int[] entityIds = new int[noOfEntities]; + int[][] entityBlocks = new int[noOfEntities][]; + for (String tmpEntityIndex : entityIndices) { + reporter.progress(); + String[] idsArray = tmpEntityIndex.split(", "); + entityIds[counter] = Integer.parseInt(idsArray[0].substring(1)); + + if (entityIds[counter] >= 0) containsPositive = true; else containsNegative = true; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) + + int noOfBlocks = idsArray.length-1; + entityBlocks[counter] = new int[noOfBlocks]; + for (int i=0; i < noOfBlocks; ++i) { + entityBlocks[counter][i] = Integer.parseInt(idsArray[i+1]); + } + counter++; + } + + + //dirty ER + /*int blockId = key.get(); + + for (int i = 0; i < noOfEntities-1; ++i) { + reporter.setStatus(++counter+"/"+noOfEntities); + for (int j = i+1; j < noOfEntities; ++j) { + double weight = MBTools.getWeight(blockId, entityBlocks[i], entityBlocks[j], weightingScheme, dirtyBlocks, validComparisons); + if (weight >= minValue) { + comparison.set(entityIds[i]+","+entityIds[j]); + weightToEmit.set(weight); + output.collect(weightToEmit, comparison); + } + } + }*/ + + + //clean-clean ER + if (!(containsNegative && containsPositive)) { //unless this check is made in ExtendeInput Reducer + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; //no comparisons from this block + } + + int blockId = key.get(); + for (int i = 0; i < noOfEntities-1; ++i) { + reporter.setStatus(++counter+"/"+noOfEntities); + for (int j = i+1; j < noOfEntities; ++j) { + if ((entityIds[i] ^ entityIds[j]) >> 31 == 0) { //equal sign bit? 
(30% faster than if ((e1 > 0 && e2 > 0) || (e1 < 0 && e2 < 0)) ) + continue; + } + double weight = MBTools.getWeight(blockId, entityBlocks[i], entityBlocks[j], weightingScheme, cleanBlocks, validComparisons); + if (weight >= minValue) { + comparison.set(entityIds[i]+","+entityIds[j]); + weightToEmit.set(weight); + output.collect(weightToEmit, comparison); + } + } + } + } +} diff --git a/MetaBlocking/src/main/java/advanced/CEPFinalMapperNewEJS.java b/MetaBlocking/src/main/java/advanced/CEPFinalMapperNewEJS.java new file mode 100644 index 0000000..8eeb07d --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CEPFinalMapperNewEJS.java @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import advanced.AverageWeightMapperNewFromCompressed.Weight; +import advanced.CEPMapperNew.OutputData; + +public class CEPFinalMapperNewEJS extends MapReduceBase implements Mapper { + + public enum OutputData {PURGED_BLOCKS}; + + private String weightingScheme; + double minValue; + + private Text comparison = new Text(); + private DoubleWritable weightToEmit = new DoubleWritable(); + private int cleanBlocks; + private int dirtyBlocks; + private long validComparisons; + + public void configure (JobConf conf) { + weightingScheme = "EJS"; + minValue = Double.parseDouble(conf.get("min", "0")); + cleanBlocks = conf.getInt("cleanBlocks", 0); + dirtyBlocks = conf.getInt("dirtyBlocks", 0); + validComparisons = Long.parseLong(conf.get("validComparisons","0")); + } + + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + String[] entityIndices = value.toString().split("]"); + int noOfEntities = entityIndices.length; + /*//dirty ER + if (noOfEntities < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + boolean containsPositive = false; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) + boolean containsNegative = false; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) + + + int counter = 0; + int[] entityIds = new int[noOfEntities]; + int[][] entityBlocks = new int[noOfEntities][]; + for (String tmpEntityIndex : entityIndices) { + reporter.progress(); + String[] idsArray = tmpEntityIndex.split(", "); + entityIds[counter] = Integer.parseInt(idsArray[0].substring(1)); + + if (entityIds[counter] >= 0) containsPositive = true; else containsNegative = true; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) + + int noOfBlocks = idsArray.length-1; + entityBlocks[counter] = new int[noOfBlocks]; + for (int i=0; i < noOfBlocks; ++i) { + entityBlocks[counter][i] = Integer.parseInt(idsArray[i+1]); + } + counter++; + } + + DecimalFormat df = new DecimalFormat("#.###"); //format doubles to keep only first 4 decimal points (saves space) + + + //dirty ER + /*int blockId = key.get(); + + for (int i = 0; i < noOfEntities-1; ++i) { + 
reporter.setStatus(++counter+"/"+noOfEntities); + for (int j = i+1; j < noOfEntities; ++j) { + double weight = MBTools.getWeight(blockId, entityBlocks[i], entityBlocks[j], weightingScheme, dirtyBlocks, validComparisons); + if (weight >= minValue) { + comparison.set(entityIds[i]+","+entityIds[j]); + weightToEmit.set(Double.parseDouble(df.format(weight))); + output.collect(weightToEmit, comparison); + } + } + }*/ + + + //clean-clean ER + if (!(containsNegative && containsPositive)) { //unless this check is made in ExtendeInput Reducer + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; //no comparisons from this block + } + + int blockId = key.get(); + for (int i = 0; i < noOfEntities-1; ++i) { + reporter.setStatus(++counter+"/"+noOfEntities); + for (int j = i+1; j < noOfEntities; ++j) { + if ((entityIds[i] ^ entityIds[j]) >> 31 == 0) { //equal sign bit? (30% faster than if ((e1 > 0 && e2 > 0) || (e1 < 0 && e2 < 0)) ) + continue; + } + double weight = MBTools.getWeight(blockId, entityBlocks[i], entityBlocks[j], weightingScheme, cleanBlocks, validComparisons); + if (weight >= minValue) { + comparison.set(entityIds[i]+","+entityIds[j]); + weightToEmit.set(Double.parseDouble(df.format(weight))); + output.collect(weightToEmit, comparison); + } + } + } + } +} diff --git a/MetaBlocking/src/main/java/advanced/CEPFinalMapperOnly.java b/MetaBlocking/src/main/java/advanced/CEPFinalMapperOnly.java new file mode 100644 index 0000000..61790e0 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CEPFinalMapperOnly.java @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class CEPFinalMapperOnly extends MapReduceBase implements Mapper { + + public enum OutputData {PURGED_BLOCKS}; + + private String weightingScheme; + double minValue; + + private Text comparison = new Text(); + private DoubleWritable weightToEmit = new DoubleWritable(); + private int cleanBlocks; + private int dirtyBlocks; + + public void configure (JobConf conf) { + weightingScheme = conf.get("weightingScheme"); //one of ARCS,CBS,ECBS,JS,EJS + minValue = Double.parseDouble(conf.get("min", "0")); + cleanBlocks = conf.getInt("cleanBlocks", 0); + dirtyBlocks = conf.getInt("dirtyBlocks", 0); + } + + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + Map> entityIndex = new TreeMap<>(); //key is entity id, value is the list of blocks that contain the key + List blocks; + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array with the first element the entity and the rest elements its blocks + for (String tmpEntityIndex : entityIndices) { + if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} + tmpEntityIndex = tmpEntityIndex.substring(1); //to remove the initial '[' + String[] idsArray = tmpEntityIndex.split(", "); + int entityId = Integer.parseInt(idsArray[0]); + blocks = new ArrayList<>(idsArray.length-1); //maybe initial 
capacity is not needed + for (int i=1; i < idsArray.length; ++i) { + blocks.add(Integer.parseInt(idsArray[i])); + } + entityIndex.put(entityId, blocks); + } + + //dirty ER + List entities = new ArrayList<>(entityIndex.keySet()); + if (entities.size() < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + + + //clean-clean ER + /*List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + for (int entity : entityIndex.keySet()) { + if (entity < 0) { + D2entities.add(entity); + } else { + D1entities.add(entity); + } + } + if (D1entities.isEmpty() || D2entities.isEmpty()) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + + + //clean-clean ER + /*int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + int D1size = D1entities.size(); + + for (int e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1size); + blockse1 = entityIndex.get(e1); + for (int e2 : D2entities) { + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId, weightingScheme)) { + Double weight = MBTools.getWeight(blockse1, blockse2, blockId, weightingScheme, cleanBlocks, 0); + if (weight >= minValue) { //edge belongs in top k+extraElements + comparison.set(e1+","+e2); + weightToEmit.set(weight); + output.collect(comparison, weightToEmit); + } + } + } + }*/ + + //dirty ER + int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + Integer []entitiesArray = new Integer[entities.size()]; + entitiesArray = entities.toArray(entitiesArray); + int blockSize = entitiesArray.length; + + for (int i = 0; i < blockSize-1; ++i) { + int e1 = entitiesArray[i]; + reporter.setStatus(++counter+"/"+blockSize); + blockse1 = entityIndex.get(e1); + for (int j = i+1; j < blockSize; ++j) { + int e2 = entitiesArray[j]; + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(blockse1, blockse2, blockId, weightingScheme, dirtyBlocks); + if (weight >= minValue) { //edge belongs in top k+extraElements + comparison.set(e1+","+e2); + weightToEmit.set(weight); + output.collect(comparison, weightToEmit); + } + } + } + } + + } + + +} diff --git a/MetaBlocking/src/main/java/advanced/CEPFinalMapperOnlyNew.java b/MetaBlocking/src/main/java/advanced/CEPFinalMapperOnlyNew.java new file mode 100644 index 0000000..3d5b06b --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CEPFinalMapperOnlyNew.java @@ -0,0 +1,116 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import advanced.CEPFinalMapperNew.OutputData; + +public class CEPFinalMapperOnlyNew extends MapReduceBase implements Mapper { + + public enum OutputData {PURGED_BLOCKS}; + + private String weightingScheme; + double minValue; + + private Text comparison = new Text(); + private DoubleWritable weightToEmit = new DoubleWritable(); + private int cleanBlocks; + private int dirtyBlocks; + private long validComparisons; + + public void configure (JobConf conf) { + 
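+ // Map-only variant: CEPFinalDriver picks this mapper with zero reduce tasks when the CEP
+ // counting job reports no extra elements to skip, so qualifying comparisons are written out
+ // directly as (comparison, weight) pairs instead of passing through CEPFinalReducer.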
weightingScheme = conf.get("weightingScheme"); //one of ARCS,CBS,ECBS,JS,EJS + minValue = Double.parseDouble(conf.get("min", "0")); + cleanBlocks = conf.getInt("cleanBlocks", 0); + dirtyBlocks = conf.getInt("dirtyBlocks", 0); + validComparisons = Long.parseLong(conf.get("validComparisons","0")); + } + + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + String[] entityIndices = value.toString().split("]"); + int noOfEntities = entityIndices.length; + /*//dirty ER + if (noOfEntities < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + boolean containsPositive = false; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) + boolean containsNegative = false; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) + + + int counter = 0; + int[] entityIds = new int[noOfEntities]; + int[][] entityBlocks = new int[noOfEntities][]; + for (String tmpEntityIndex : entityIndices) { + reporter.progress(); + String[] idsArray = tmpEntityIndex.split(", "); + entityIds[counter] = Integer.parseInt(idsArray[0].substring(1)); + + if (entityIds[counter] >= 0) containsPositive = true; else containsNegative = true; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) + + int noOfBlocks = idsArray.length-1; + entityBlocks[counter] = new int[noOfBlocks]; + for (int i=0; i < noOfBlocks; ++i) { + entityBlocks[counter][i] = Integer.parseInt(idsArray[i+1]); + } + counter++; + } + + //dirty ER + /*int blockId = key.get(); + + for (int i = 0; i < noOfEntities-1; ++i) { + reporter.setStatus(++counter+"/"+noOfEntities); + for (int j = i+1; j < noOfEntities; ++j) { + double weight = MBTools.getWeight(blockId, entityBlocks[i], entityBlocks[j], weightingScheme, dirtyBlocks, validComparisons); + if (weight >= minValue) { + comparison.set(entityIds[i]+","+entityIds[j]); + weightToEmit.set(weight); + output.collect(comparison, weightToEmit); + } + } + }*/ + + + //clean-clean ER + if (!(containsNegative && containsPositive)) { //unless this check is made in ExtendeInput Reducer + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; //no comparisons from this block + } + + int blockId = key.get(); + for (int i = 0; i < noOfEntities-1; ++i) { + reporter.setStatus(++counter+"/"+noOfEntities); + for (int j = i+1; j < noOfEntities; ++j) { + if ((entityIds[i] ^ entityIds[j]) >> 31 == 0) { //equal sign bit? 
(30% faster than if ((e1 > 0 && e2 > 0) || (e1 < 0 && e2 < 0)) ) + continue; + } + double weight = MBTools.getWeight(blockId, entityBlocks[i], entityBlocks[j], weightingScheme, cleanBlocks, validComparisons); + if (weight >= minValue) { + comparison.set(entityIds[i]+","+entityIds[j]); + weightToEmit.set(weight); + output.collect(comparison, weightToEmit); + } + } + } + } +} diff --git a/MetaBlocking/src/main/java/advanced/CEPFinalMapperOnlyNewEJS.java b/MetaBlocking/src/main/java/advanced/CEPFinalMapperOnlyNewEJS.java new file mode 100644 index 0000000..5e7255d --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CEPFinalMapperOnlyNewEJS.java @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import advanced.CEPFinalMapperNew.OutputData; + +public class CEPFinalMapperOnlyNewEJS extends MapReduceBase implements Mapper { + + public enum OutputData {PURGED_BLOCKS}; + + private String weightingScheme; + double minValue; + + private Text comparison = new Text(); + private DoubleWritable weightToEmit = new DoubleWritable(); + private int cleanBlocks; + private int dirtyBlocks; + private long validComparisons; + + public void configure (JobConf conf) { + weightingScheme = "EJS"; + minValue = Double.parseDouble(conf.get("min", "0")); + cleanBlocks = conf.getInt("cleanBlocks", 0); + dirtyBlocks = conf.getInt("dirtyBlocks", 0); + validComparisons = Long.parseLong(conf.get("validComparisons","0")); + } + + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + String[] entityIndices = value.toString().split("]"); + int noOfEntities = entityIndices.length; + /*//dirty ER + if (noOfEntities < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + boolean containsPositive = false; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) + boolean containsNegative = false; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) + + + int counter = 0; + int[] entityIds = new int[noOfEntities]; + int[][] entityBlocks = new int[noOfEntities][]; + for (String tmpEntityIndex : entityIndices) { + reporter.progress(); + String[] idsArray = tmpEntityIndex.split(", "); + entityIds[counter] = Integer.parseInt(idsArray[0].substring(1)); + + if (entityIds[counter] >= 0) containsPositive = true; else containsNegative = true; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) + + int noOfBlocks = idsArray.length-1; + entityBlocks[counter] = new int[noOfBlocks]; + for (int i=0; i < noOfBlocks; ++i) { + entityBlocks[counter][i] = Integer.parseInt(idsArray[i+1]); + } + counter++; + } + + DecimalFormat df = new DecimalFormat("#.###"); //format doubles to keep only first 4 decimal points (saves space) + + //dirty ER + /* int blockId = key.get(); + + for (int i = 0; i < noOfEntities-1; ++i) { + reporter.setStatus(++counter+"/"+noOfEntities); + for (int j = i+1; j < 
noOfEntities; ++j) { + double weight = MBTools.getWeight(blockId, entityBlocks[i], entityBlocks[j], weightingScheme, dirtyBlocks, validComparisons); + if (weight >= minValue) { + comparison.set(entityIds[i]+","+entityIds[j]); + weightToEmit.set(Double.parseDouble(df.format(weight))); + output.collect(comparison, weightToEmit); + } + } + }*/ + + + //clean-clean ER + if (!(containsNegative && containsPositive)) { //unless this check is made in ExtendeInput Reducer + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; //no comparisons from this block + } + + int blockId = key.get(); + for (int i = 0; i < noOfEntities-1; ++i) { + reporter.setStatus(++counter+"/"+noOfEntities); + for (int j = i+1; j < noOfEntities; ++j) { + if ((entityIds[i] ^ entityIds[j]) >> 31 == 0) { //equal sign bit? (30% faster than if ((e1 > 0 && e2 > 0) || (e1 < 0 && e2 < 0)) ) + continue; + } + double weight = MBTools.getWeight(blockId, entityBlocks[i], entityBlocks[j], weightingScheme, cleanBlocks, validComparisons); + if (weight >= minValue) { + comparison.set(entityIds[i]+","+entityIds[j]); + weightToEmit.set(Double.parseDouble(df.format(weight))); + output.collect(comparison, weightToEmit); + } + } + } + } +} diff --git a/MetaBlocking/src/main/java/advanced/CEPMapper.java b/MetaBlocking/src/main/java/advanced/CEPMapper.java new file mode 100644 index 0000000..9c70328 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CEPMapper.java @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class CEPMapper extends MapReduceBase implements Mapper { + + public enum OutputData {PURGED_BLOCKS}; + + private String weightingScheme; + private VIntWritable one = new VIntWritable(1); + private DoubleWritable weightToEmit = new DoubleWritable(); + private int cleanBlocks; + private int dirtyBlocks; + + public void configure (JobConf job) { + weightingScheme = job.get("weightingScheme"); //one of ARCS,CBS,ECBS,JS,EJS + cleanBlocks = job.getInt("cleanBlocks", 0); + dirtyBlocks = job.getInt("dirtyBlocks", 0); + } + + + /** + * input: a blocking collection + * @param key block id + * @param value arrays of entity ids in this block (first element), along with the block ids (sorted) that contain them (remaining elements) + * e.g. 
[1,7,8,9],[3,1,8,10] means that in this block belong the entities 1 and 3 and entity 1 is placed in blocks 7,8,9 (sorted) and + * entity 3 is placed in blocks 1,8,10 + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + Map> entityIndex = new TreeMap<>(); //key is entity id, value is the list of blocks that contain the key + List blocks; + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array with the first element the entity and the rest elements its blocks + for (String tmpEntityIndex : entityIndices) { + if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} + tmpEntityIndex = tmpEntityIndex.substring(1); //to remove the initial '[' + String[] idsArray = tmpEntityIndex.split(", "); + int entityId = Integer.parseInt(idsArray[0]); + blocks = new ArrayList<>(idsArray.length-1); //maybe initial capacity is not needed + for (int i=1; i < idsArray.length; ++i) { + blocks.add(Integer.parseInt(idsArray[i])); + } + entityIndex.put(entityId, blocks); + } + + //dirty ER + List entities = new ArrayList<>(entityIndex.keySet()); + if (entities.size() < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + + + //clean-clean ER + /*List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + for (int entity : entityIndex.keySet()) { + if (entity < 0) { + D2entities.add(entity); + } else { + D1entities.add(entity); + } + } + if (D1entities.isEmpty() || D2entities.isEmpty()) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + + + //clean-clean ER + /*int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + int D1size = D1entities.size(); + + for (int e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1size); + blockse1 = entityIndex.get(e1); + for (int e2 : D2entities) { + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId, weightingScheme)) { + Double weight = MBTools.getWeight(blockse1, blockse2, blockId, weightingScheme, cleanBlocks, 0); + weightToEmit.set(weight); + output.collect(weightToEmit, one); + } + } + }*/ + + //dirty ER + int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + Integer []entitiesArray = new Integer[entities.size()]; + entitiesArray = entities.toArray(entitiesArray); + int blockSize = entitiesArray.length; + + for (int i = 0; i < blockSize-1; ++i) { + int e1 = entitiesArray[i]; + reporter.setStatus(++counter+"/"+blockSize); + blockse1 = entityIndex.get(e1); + for (int j = i+1; j < blockSize; ++j) { + int e2 = entitiesArray[j]; + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(blockse1, blockse2, blockId, weightingScheme, dirtyBlocks); + weightToEmit.set(weight); + output.collect(weightToEmit, one); + } + } + } + + } + + +} diff --git a/MetaBlocking/src/main/java/advanced/CEPMapperNew.java b/MetaBlocking/src/main/java/advanced/CEPMapperNew.java new file mode 100644 index 0000000..74ac22f --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CEPMapperNew.java @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; + + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import 
org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class CEPMapperNew extends MapReduceBase implements Mapper { + + public enum OutputData {PURGED_BLOCKS}; + + private String weightingScheme; + private VIntWritable one = new VIntWritable(1); + private DoubleWritable weightToEmit = new DoubleWritable(); + private int cleanBlocks; + private int dirtyBlocks; + private long validComparisons; + + public void configure (JobConf job) { + weightingScheme = job.get("weightingScheme"); //one of ARCS,CBS,ECBS,JS,EJS + cleanBlocks = job.getInt("cleanBlocks", 0); + dirtyBlocks = job.getInt("dirtyBlocks", 0); + validComparisons = Long.parseLong(job.get("validComparisons","0")); + } + + + /** + * input: a blocking collection + * @param key block id + * @param value arrays of entity ids in this block (first element), along with the block ids (sorted) that contain them (remaining elements) + * e.g. [1,7,8,9],[3,1,8,10] means that in this block belong the entities 1 and 3 and entity 1 is placed in blocks 7,8,9 (sorted) and + * entity 3 is placed in blocks 1,8,10 + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + String[] entityIndices = value.toString().split("]"); + int noOfEntities = entityIndices.length; + /*//dirty ER + if (noOfEntities < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + boolean containsPositive = false; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) + boolean containsNegative = false; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) + + int counter = 0; + int[] entityIds = new int[noOfEntities]; + int[][] entityBlocks = new int[noOfEntities][]; + for (String tmpEntityIndex : entityIndices) { + String[] idsArray = tmpEntityIndex.split(", "); + entityIds[counter] = Integer.parseInt(idsArray[0].substring(1)); //first is entity id + + if (entityIds[counter] >= 0) containsPositive = true; else containsNegative = true; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) + + int noOfBlocks = idsArray.length-1; + entityBlocks[counter] = new int[noOfBlocks]; + for (int i=0; i < noOfBlocks; ++i) { + entityBlocks[counter][i] = Integer.parseInt(idsArray[i+1]); + } + counter++; + } + + + //dirty ER + /*int blockId = key.get(); + + for (int i = 0; i < noOfEntities-1; ++i) { + reporter.setStatus(++counter+"/"+noOfEntities); + for (int j = i+1; j < noOfEntities; ++j) { + double weight = MBTools.getWeight(blockId, entityBlocks[i], entityBlocks[j], weightingScheme, dirtyBlocks, validComparisons); + if (weight > 0) { + weightToEmit.set(weight); + output.collect(weightToEmit, one); + } + } + }*/ + + //clean-clean ER + if (!(containsNegative && containsPositive)) { //unless this check is made in ExtendeInput Reducer + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; //no comparisons from this block + } + + int blockId = key.get(); + for (int i = 0; i < noOfEntities-1; ++i) { + reporter.setStatus(++counter+"/"+noOfEntities); + for (int j = i+1; j < noOfEntities; ++j) { + if ((entityIds[i] ^ entityIds[j]) >> 31 == 0) { //equal sign bit? 
(30% faster than if ((e1 > 0 && e2 > 0) || (e1 < 0 && e2 < 0)) ) + continue; + } + double weight = MBTools.getWeight(blockId, entityBlocks[i], entityBlocks[j], weightingScheme, cleanBlocks, validComparisons); + if (weight > 0) { + weightToEmit.set(weight); + output.collect(weightToEmit, one); + } + } + } + } +} diff --git a/MetaBlocking/src/main/java/advanced/CEPMapperNewEJS.java b/MetaBlocking/src/main/java/advanced/CEPMapperNewEJS.java new file mode 100644 index 0000000..90b5269 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CEPMapperNewEJS.java @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.text.DecimalFormat; + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class CEPMapperNewEJS extends MapReduceBase implements Mapper { + + public enum OutputData {PURGED_BLOCKS}; + + private String weightingScheme; + private VIntWritable one = new VIntWritable(1); + private DoubleWritable weightToEmit = new DoubleWritable(); + private int cleanBlocks; + private int dirtyBlocks; + private long validComparisons; + + public void configure (JobConf job) { + weightingScheme = "EJS"; + cleanBlocks = job.getInt("cleanBlocks", 0); + dirtyBlocks = job.getInt("dirtyBlocks", 0); + validComparisons = Long.parseLong(job.get("validComparisons","0")); + } + + + /** + * input: a blocking collection + * @param key block id + * @param value arrays of entity ids in this block (first element), along with the block ids (sorted) that contain them (remaining elements) + * e.g. 
[1,7,8,9],[3,1,8,10] means that in this block belong the entities 1 and 3 and entity 1 is placed in blocks 7,8,9 (sorted) and + * entity 3 is placed in blocks 1,8,10 + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + String[] entityIndices = value.toString().split("]"); + int noOfEntities = entityIndices.length; + /*//dirty ER + if (noOfEntities < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + +// boolean containsPositive = false; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) +// boolean containsNegative = false; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) + + int counter = 0; + int[] entityIds = new int[noOfEntities]; + int[][] entityBlocks = new int[noOfEntities][]; + for (String tmpEntityIndex : entityIndices) { + String[] idsArray = tmpEntityIndex.split(", "); + entityIds[counter] = Integer.parseInt(idsArray[0].substring(1)); //first is entity id + +// if (entityIds[counter] >= 0) containsPositive = true; else containsNegative = true; //clean-clean (in case this check is NOT made in ExtendeInput Reducer) + + int noOfBlocks = idsArray.length-1; + entityBlocks[counter] = new int[noOfBlocks]; + for (int i=0; i < noOfBlocks; ++i) { + entityBlocks[counter][i] = Integer.parseInt(idsArray[i+1]); + } + counter++; + } + + DecimalFormat df = new DecimalFormat("#.###"); //format doubles to keep only first 4 decimal points (saves space) + + //dirty ER + int blockId = key.get(); + + for (int i = 0; i < noOfEntities-1; ++i) { + reporter.setStatus(++counter+"/"+noOfEntities); + for (int j = i+1; j < noOfEntities; ++j) { + double weight = MBTools.getWeight(blockId, entityBlocks[i], entityBlocks[j], weightingScheme, dirtyBlocks, validComparisons); + if (weight > 0) { + weightToEmit.set(Double.parseDouble(df.format(weight))); + output.collect(weightToEmit, one); + } + } + } + + //clean-clean ER + /*if (!(containsNegative && containsPositive)) { //unless this check is made in ExtendeInput Reducer + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; //no comparisons from this block + } + + int blockId = key.get(); + for (int i = 0; i < noOfEntities-1; ++i) { + reporter.setStatus(++counter+"/"+noOfEntities); + for (int j = i+1; j < noOfEntities; ++j) { + if ((entityIds[i] ^ entityIds[j]) >> 31 == 0) { //equal sign bit? 
(30% faster than if ((e1 > 0 && e2 > 0) || (e1 < 0 && e2 < 0)) ) + continue; + } + double weight = MBTools.getWeight(blockId, entityBlocks[i], entityBlocks[j], weightingScheme, cleanBlocks, validComparisons); + if (weight > 0) { + weightToEmit.set(Double.parseDouble(df.format(weight))); + output.collect(weightToEmit, one); + } + } + } */ + } +} diff --git a/MetaBlocking/src/main/java/advanced/CNP.java b/MetaBlocking/src/main/java/advanced/CNP.java new file mode 100644 index 0000000..dda2a49 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CNP.java @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.IOException; +import java.util.Collections; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class CNP extends MapReduceBase implements Reducer { + + + private int k; //for topK + public void configure (JobConf job) { + float BCin = job.getFloat("BCin", 1.0f); + k = ((Double)Math.floor(BCin - 1)).intValue(); + } + + Text keyToEmit = new Text(); + DoubleWritable valueToEmit = new DoubleWritable(); + + /** + * output for each input node its edges with weight in top k weights + * @param key i entity id + * @param value list of j,wij (entity id, weight of edge i-j) + * @param output key:i,j value:wij for wij in top k weights + */ + public void reduce(VIntWritable key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + //sort neighbors in descending order of weight (key=weigh, value=neighborID) + Map neighbors = new TreeMap<>(Collections.reverseOrder()); + + while (values.hasNext()) { + String[] value = values.next().toString().split(","); + Double weight = Double.parseDouble(value[1]); + Integer neighbor = key.get() + Integer.parseInt(value[0]); //e2 is compressed as e2-e1, so we add e1 (i.e. 
key) + neighbors.put(weight, neighbor); + } + + int keyId = key.get(); + + //Emit top k edges (k nearest neighbors) + for (Map.Entry edge : neighbors.entrySet()) { + if (k-- == 0) { return; } + if (keyId > edge.getValue()) { //to make sure they will go to the same (next) reducer (of reciprocal) + keyToEmit.set(keyId+","+edge.getValue()); + } else { + keyToEmit.set(edge.getValue()+","+keyId); + } + valueToEmit.set(edge.getKey()); + output.collect(keyToEmit, valueToEmit); + } + + } + +} diff --git a/MetaBlocking/src/main/java/advanced/CNPDriver.java b/MetaBlocking/src/main/java/advanced/CNPDriver.java new file mode 100644 index 0000000..bdd3d7d --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CNPDriver.java @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +public class CNPDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(advanced.CNPDriver.class); + + conf.setJobName("CNP from Extended Input"); //used for CNP + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(Text.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + conf.set("weightingScheme", args[0]); //one of: CBS, ECBS, JS, ARCS + FileInputFormat.setInputPaths(conf, new Path(args[1])); //ExtendedInput + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //CNP + + conf.setMapperClass(advanced.NPMapperNew.class); + conf.setReducerClass(blockingGraphPruning.CNP.class); + + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setInt("mapred.task.timeout", 10000000); + + conf.setNumReduceTasks(448); + + BufferedReader br = null, br2 = null, br3 = null; + try{ + Path pt=new Path("/user/hduser/BCin.txt"); + Path cleanPath=new Path("/user/hduser/numBlocksClean.txt"); + Path dirtyPath=new Path("/user/hduser/numBlocksDirty.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + Float BCin = Float.parseFloat(br.readLine()); + conf.setFloat("BCin", BCin); + br2=new BufferedReader(new InputStreamReader(fs.open(cleanPath))); + Integer cleanBlocks = Integer.parseInt(br2.readLine()); + conf.setInt("cleanBlocks", cleanBlocks); + br3=new BufferedReader(new InputStreamReader(fs.open(dirtyPath))); + Integer dirtyBlocks = Integer.parseInt(br3.readLine()); + conf.setInt("dirtyBlocks", dirtyBlocks); + }catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close(); br2.close();br3.close(); } + catch 
(IOException e) {System.err.println(e.toString());} + } + + conf.setCompressMapOutput(true); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/advanced/CNPEJSDriver.java b/MetaBlocking/src/main/java/advanced/CNPEJSDriver.java new file mode 100644 index 0000000..875848c --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/CNPEJSDriver.java @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +public class CNPEJSDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(advanced.CNPEJSDriver.class); + + conf.setJobName("CNP from Extended Input EJS"); //used for CNP + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(Text.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //EJSFinal (extended input for EJS) + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //CNP + + conf.setMapperClass(advanced.NPMapperEJS.class); + conf.setReducerClass(blockingGraphPruning.CNP.class); + + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setInt("mapred.task.timeout", 8000000); + conf.setNumReduceTasks(560); + + BufferedReader br = null, br2 = null; + try{ + Path pt=new Path("/user/hduser/BCin.txt"); + Path pt2 = new Path("/user/hduser/validComparisons.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + Float BCin = Float.parseFloat(br.readLine()); + conf.setFloat("BCin", BCin); + br2=new BufferedReader(new InputStreamReader(fs.open(pt2))); + String validComparisons = br2.readLine(); + conf.set("validComparisons", validComparisons); + }catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close(); br2.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + conf.setCompressMapOutput(true); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/advanced/NPMapper.java b/MetaBlocking/src/main/java/advanced/NPMapper.java new file mode 100644 index 0000000..454419e --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/NPMapper.java @@ -0,0 +1,160 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + + +import hadoopUtils.MBTools; +import 
java.io.IOException; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class NPMapper extends MapReduceBase implements Mapper { + + public enum Weight {WEIGHT_COUNTER}; + public enum OutputData {PURGED_BLOCKS}; + public enum InputData {COMPARISONS}; + + private String weightingScheme; + + private int cleanBlocks; + private int dirtyBlocks; + + private VIntWritable ei = new VIntWritable(); + private VIntWritable ej = new VIntWritable(); + Text iWij = new Text(); + Text jWij = new Text(); + + public void configure (JobConf job) { + weightingScheme = job.get("weightingScheme"); //one of ARCS,CBS,ECBS,JS,EJS + cleanBlocks = job.getInt("cleanBlocks", 0); + dirtyBlocks = job.getInt("dirtyBlocks", 0); + } + + + /** + * input: an extended blocking collection + * @param key block id + * @param value arrays of entity ids in this block (first element), along with the block ids (sorted) that contain them (remaining elements) + * e.g. [1,7,8,9][3,1,8,10] means that in this block belong the entities 1 and 3 and entity 1 is placed in blocks 7,8,9 (sorted) and + * entity 3 is placed in blocks 1,8,10 + * @param output key: entity id i (each of the input values). value: entity id j, weight of ei-ej wij + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + Map> entityIndex = new TreeMap<>(); //key is entity id, value is the list of blocks that contain the key + List blocks; + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array with the first element the entity and the rest elements its blocks + for (String tmpEntityIndex : entityIndices) { + if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} + tmpEntityIndex = tmpEntityIndex.substring(1); //to remove the initial '[' + String[] idsArray = tmpEntityIndex.split(", "); + int entityId = Integer.parseInt(idsArray[0]); + blocks = new ArrayList<>(idsArray.length-1); //maybe initial capacity is not needed + for (int i=1; i < idsArray.length; ++i) { + blocks.add(Integer.parseInt(idsArray[i])); + } + entityIndex.put(entityId, blocks); + } + + DecimalFormat df = new DecimalFormat("#.###"); //format doubles to keep only first 4 decimal points (saves space) + + //dirty ER + List entities = new ArrayList<>(entityIndex.keySet()); + if (entities.size() < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + //clean-clean ER + /*List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + for (int entity : entityIndex.keySet()) { + if (entity < 0) { + D2entities.add(entity); + } else { + D1entities.add(entity); + } + } + if (D1entities.isEmpty() || D2entities.isEmpty()) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + //clean-clean ER + /*int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + int D1size = D1entities.size(); + //TODO: add formatting, to skip many decimal digits in weight string + + for (int e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1size); + blockse1 = entityIndex.get(e1); + for (int e2 : 
D2entities) { + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId, weightingScheme)) { + Double weight = MBTools.getWeight(blockse1, blockse2, blockId, weightingScheme, cleanBlocks, 0); + Double weightToEmit = Double.parseDouble(df.format(weight)); + ei.set(e1); + jWij.set(e2+","+weightToEmit); + output.collect(ei, jWij); + + ej.set(e2); + iWij.set(e1+","+weightToEmit); + output.collect(ej, iWij); + } + } + }*/ + + //dirty ER + int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + Integer []entitiesArray = new Integer[entities.size()]; + entitiesArray = entities.toArray(entitiesArray); + int blockSize = entitiesArray.length; + + long numEntities = entities.size(); + + reporter.incrCounter(InputData.COMPARISONS, (numEntities * (numEntities-1))/2); + + for (int i = 0; i < blockSize-1; ++i) { + int e1 = entitiesArray[i]; + reporter.setStatus(++counter+"/"+blockSize); + blockse1 = entityIndex.get(e1); + for (int j = i+1; j < blockSize; ++j) { + int e2 = entitiesArray[j]; + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(blockse1, blockse2, blockId, weightingScheme, dirtyBlocks); + Double weightToEmit = Double.parseDouble(df.format(weight)); + ei.set(e1); + jWij.set(e2+","+weightToEmit); + output.collect(ei, jWij); + + ej.set(e2); + iWij.set(e1+","+weightToEmit); + output.collect(ej, iWij); + } + } + } + } + + +} diff --git a/MetaBlocking/src/main/java/advanced/NPMapperEJS.java b/MetaBlocking/src/main/java/advanced/NPMapperEJS.java new file mode 100644 index 0000000..529a5b6 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/NPMapperEJS.java @@ -0,0 +1,159 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class NPMapperEJS extends MapReduceBase implements Mapper { + + public enum OutputData {PURGED_BLOCKS}; + + private long validComparisons; + + private VIntWritable ei = new VIntWritable(); + private VIntWritable ej = new VIntWritable(); + Text iWij = new Text(); + Text jWij = new Text(); + + public void configure(JobConf job) { + validComparisons = Long.parseLong(job.get("validComparisons")); //one of ARCS,CBS,ECBS,JS,EJS + } + + + /** + * input: an extended blocking collection + * @param key block id + * @param value arrays of entity ids in this block (first element), along with the block ids (sorted) that contain them (remaining elements) and the cardinality (#non-redundatnt comparisons) of this entity + * e.g. [1,7,8,9,4][3,1,8,10,10] means that in this block belong the entities 1 and 3 and entity 1 is placed in blocks 7,8,9 (sorted) and + * entity 3 is placed in blocks 1,8,10 and entity 1 participates in 4 comparisons and entity 3 participates in 10 comparisons + * @param output key: entity id (each of the input values). 
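+ * (in practice, this mapper emits key = entity id and value = a "neighborId,weight" pair for every non-repeated comparison, once under each of the two entity ids)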
value: entity ids separated by " " (neighbors of output key) + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + Map> entityIndex = new TreeMap<>(); //key is entity id, value is the list of blocks that contain the key + List blocks; + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array with the first element the entity and the rest elements its blocks + for (String tmpEntityIndex : entityIndices) { + if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} + tmpEntityIndex = tmpEntityIndex.substring(1); //to remove the initial '[' + String[] idsArray = tmpEntityIndex.split(", "); + int entityId = Integer.parseInt(idsArray[0]); + blocks = new ArrayList<>(idsArray.length-1); //maybe initial capacity is not needed + for (int i=1; i < idsArray.length; ++i) { + blocks.add(Integer.parseInt(idsArray[i])); + } + entityIndex.put(entityId, blocks); //the last id in blocks is actually the cardinality of entity + } + + + DecimalFormat df = new DecimalFormat("#.###"); //format doubles to keep only first 4 decimal points (saves space) + + //dirty ER + List entities = new ArrayList<>(entityIndex.keySet()); + if (entities.size() < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + /* + //clean-clean ER + List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + for (int entity : entityIndex.keySet()) { + if (entity < 0) { + D2entities.add(entity); + } else { + D1entities.add(entity); + } + } + if (D1entities.isEmpty() || D2entities.isEmpty()) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + + + //clean-clean ER + int blockId = key.get(); + List allValuesE1; + List allValuesE2; + List blockse1; + List blockse2; + int counter = 0; + int D1size = D1entities.size(); + + for (int e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1size); + allValuesE1 = entityIndex.get(e1); //the last value is the cardinality of e1 + blockse1 = allValuesE1.subList(0, allValuesE1.size()-1); //the last value is the cardinality of e1 + for (int e2 : D2entities) { + allValuesE2 = entityIndex.get(e2); //the last value is the cardinality of e2 + blockse2 = allValuesE2.subList(0, allValuesE2.size()-1); //the last value is the cardinality of e2 + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(allValuesE1, allValuesE2, blockId, "EJS", 0, validComparisons); + weight = Double.parseDouble(df.format(weight)); + ei.set(e1); + jWij.set(e2+","+weight); + output.collect(ei, jWij); + + ej.set(e2); + iWij.set(e1+","+weight); + output.collect(ej, iWij); + } + } + } + */ + //dirty ER + int blockId = key.get(); + List allValuesE1; + List allValuesE2; + List blockse1; + List blockse2; + int counter = 0; + Integer []entitiesArray = new Integer[entities.size()]; + entitiesArray = entities.toArray(entitiesArray); + int blockSize = entitiesArray.length; + + for (int i = 0; i < blockSize-1; ++i) { + int e1 = entitiesArray[i]; + reporter.setStatus(++counter+"/"+blockSize); + allValuesE1 = entityIndex.get(e1); //the last value is the cardinality of e1 + blockse1 = allValuesE1.subList(0, allValuesE1.size()-1); //the last value is the cardinality of e1 + for (int j = i+1; j < blockSize; ++j) { + int e2 = entitiesArray[j]; + allValuesE2 = entityIndex.get(e2); //the last value is the cardinality of e2 + blockse2 = allValuesE2.subList(0, allValuesE2.size()-1); //the last value is 
the cardinality of e2 + + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(allValuesE1, allValuesE2, blockId, "EJS", 0, validComparisons); + weight = Double.parseDouble(df.format(weight)); + ei.set(e1); + jWij.set(e2+","+weight); + output.collect(ei, jWij); + + ej.set(e2); + iWij.set(e1+","+weight); + output.collect(ej, iWij); + } + } + } + } + + +} diff --git a/MetaBlocking/src/main/java/advanced/NPMapperFromCompressed.java b/MetaBlocking/src/main/java/advanced/NPMapperFromCompressed.java new file mode 100644 index 0000000..ba4709b --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/NPMapperFromCompressed.java @@ -0,0 +1,153 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + +public class NPMapperFromCompressed extends MapReduceBase implements Mapper { + + public enum Weight {WEIGHT_COUNTER}; + public enum OutputData {PURGED_BLOCKS}; + + private String weightingScheme; + + private VIntWritable ei = new VIntWritable(); + private VIntWritable ej = new VIntWritable(); + Text iWij = new Text(); + Text jWij = new Text(); + + public void configure (JobConf job) { + weightingScheme = job.get("weightingScheme"); //one of ARCS,CBS,ECBS,JS,EJS + } + + + /** + * input: a blocking collection + * @param key block id + * @param value arrays of entity ids in this block (first element), along with the block ids (sorted) that contain them (remaining elements) + * e.g. [1,7,8,9][3,1,8,10] means that in this block belong the entities 1 and 3 and entity 1 is placed in blocks 7,8,9 (sorted) and + * entity 3 is placed in blocks 1,8,10 + * @param output key: entity id (each of the input values). 
value: entity ids separated by " " (neighbors of output key) + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + Map entityIndex = new TreeMap<>(); //key is entity id, value is the list of blocks that contain the key + Integer[] blocks; + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array with the first element the entity and the rest elements its blocks + for (String tmpEntityIndex : entityIndices) { + if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} + tmpEntityIndex = tmpEntityIndex.substring(1); //to remove the initial '[' + String[] idsArray = tmpEntityIndex.split(", "); + int entityId = Integer.parseInt(idsArray[0]); + blocks = new Integer[idsArray.length-1]; + for (int i=1; i < idsArray.length; ++i) { + blocks[i-1] = Integer.parseInt(idsArray[i]); + } + //VIntArrayWritable compressed = new VIntArrayWritable(blocks); + //entityIndex.put(entityId, compressed); + //entityIndex.put(entityId, hadoopUtils.RelativePositionCompression.uncompress(compressed)); + entityIndex.put(entityId, hadoopUtils.RelativePositionCompression.uncompress(blocks)); + } + + //dirty ER + List entities = new ArrayList<>(entityIndex.keySet()); + if (entities.size() < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + /*//clean-clean ER + List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + for (int entity : entityIndex.keySet()) { + if (entity < 0) { + D2entities.add(entity); + } else { + D1entities.add(entity); + } + } + if (D1entities.isEmpty() || D2entities.isEmpty()) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + //clean-clean ER + /*int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + int D1size = D1entities.size(); + + for (int e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1size); + blockse1 = entityIndex.get(e1); + for (int e2 : D2entities) { + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(blockse1, blockse2, blockId, weightingScheme); + ei.set(e1); + jWij.set(e2+","+weight); + output.collect(ei, jWij); + + ej.set(e2); + iWij.set(e1+","+weight); + output.collect(ej, iWij); + } + } + }*/ + + //dirty ER + int blockId = key.get(); + Integer[] blockse1; + Integer[] blockse2; + int counter = 0; + Integer []entitiesArray = new Integer[entities.size()]; + entitiesArray = entities.toArray(entitiesArray); + int blockSize = entitiesArray.length; + DecimalFormat df = new DecimalFormat("#.####"); //format doubles to keep only first 4 decimal points (saves space) + for (int i = 0; i < blockSize-1; ++i) { + int e1 = entitiesArray[i]; + reporter.setStatus(++counter+"/"+blockSize); + //blockse1 = hadoopUtils.RelativePositionCompression.uncompress(entityIndex.get(e1)).get(); + blockse1 = entityIndex.get(e1); + for (int j = i+1; j < blockSize; ++j) { + int e2 = entitiesArray[j]; + //blockse2 = hadoopUtils.RelativePositionCompression.uncompress(entityIndex.get(e2)).get(); + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(Arrays.asList(blockse1), Arrays.asList(blockse2), blockId, weightingScheme, 0, 0); + String weightString = df.format(weight); + ei.set(e1); + jWij.set(e2-e1+","+weightString); //FIXME: set 'e2' to 'e2-e1' for compression and then update the reducer accordingly + 
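+ // note: with the e2-e1 offset encoding above, a reducer can recover the neighbor id as key.get() + offset,
+ // which is what advanced.CNP already does via key.get() + Integer.parseInt(value[0])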
output.collect(ei, jWij); + + ej.set(e2); + iWij.set(e1-e2+","+weightString); + output.collect(ej, iWij); + } + } + } + } + + +} diff --git a/MetaBlocking/src/main/java/advanced/NPMapperNew.java b/MetaBlocking/src/main/java/advanced/NPMapperNew.java new file mode 100644 index 0000000..de10ebf --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/NPMapperNew.java @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.text.DecimalFormat; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class NPMapperNew extends MapReduceBase implements Mapper<VIntWritable, Text, VIntWritable, Text> { + + public enum Weight {WEIGHT_COUNTER}; + public enum OutputData {PURGED_BLOCKS}; + public enum InputData {COMPARISONS}; + + private String weightingScheme; + + private int cleanBlocks; + private int dirtyBlocks; + + private VIntWritable ei = new VIntWritable(); + private VIntWritable ej = new VIntWritable(); + Text iWij = new Text(); + Text jWij = new Text(); + + public void configure (JobConf job) { + weightingScheme = job.get("weightingScheme"); //one of ARCS,CBS,ECBS,JS,EJS + cleanBlocks = job.getInt("cleanBlocks", 0); + dirtyBlocks = job.getInt("dirtyBlocks", 0); + } + + + /** + * input: an extended blocking collection + * @param key block id + * @param value arrays of entity ids in this block (first element), along with the block ids (sorted) that contain them (remaining elements) + * e.g. [1,7,8,9][3,1,8,10] means that this block contains entities 1 and 3, that entity 1 appears in blocks 7,8,9 (sorted) and + * that entity 3 appears in blocks 1,8,10 + * @param output key: entity id i. value: "j,wij", i.e. the other entity id and the weight of the edge i-j (each retained comparison is emitted under both i and j) + */ + public void map(VIntWritable key, Text value, + OutputCollector<VIntWritable, Text> output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array whose first element is the entity id and whose remaining elements are its block ids + + int counter = 0; + int noOfEntities = entityIndices.length; + //dirty ER +// if (noOfEntities < 2) { //TODO: is this check correct? +// reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); +// return; +// } + + int[] entityIds = new int[noOfEntities]; + int[][] entityBlocks = new int[noOfEntities][]; + for (String tmpEntityIndex : entityIndices) { +// if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} //can we remove this check? 
+ String[] idsArray = tmpEntityIndex.split(", "); + entityIds[counter] = Integer.parseInt(idsArray[0].substring(1)); //to skip '[' + + int noOfBlocks = idsArray.length-1; + entityBlocks[counter] = new int[noOfBlocks]; + for (int i=0; i < noOfBlocks; ++i) { + entityBlocks[counter][i] = Integer.parseInt(idsArray[i+1]); + } + counter++; + } + + DecimalFormat df = new DecimalFormat("#.###"); //format doubles to keep only first 4 decimal points (saves space) + + + //dirty ER + int blockId = key.get(); + + reporter.incrCounter(InputData.COMPARISONS, (noOfEntities * (noOfEntities-1))/2); + + for (int i = 0; i < noOfEntities-1; ++i) { + int e1 = entityIds[i]; + reporter.setStatus(++counter+"/"+noOfEntities); + for (int j = i+1; j < noOfEntities; ++j) { + double weight = MBTools.getWeight(blockId, entityBlocks[i], entityBlocks[j], weightingScheme, dirtyBlocks); + if (weight > 0) { + int e2 = entityIds[j]; + Double weightToEmit = Double.parseDouble(df.format(weight)); + ei.set(e1); + jWij.set(e2+","+weightToEmit); + output.collect(ei, jWij); + + ej.set(e2); + iWij.set(e1+","+weightToEmit); + output.collect(ej, iWij); + } + } + } + } + + +} diff --git a/MetaBlocking/src/main/java/advanced/NPMapperNewFromCompressed.java b/MetaBlocking/src/main/java/advanced/NPMapperNewFromCompressed.java new file mode 100644 index 0000000..82eb866 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/NPMapperNewFromCompressed.java @@ -0,0 +1,168 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + +public class NPMapperNewFromCompressed extends MapReduceBase implements Mapper { + + public enum Weight {WEIGHT_COUNTER}; + public enum OutputData {PURGED_BLOCKS}; + public enum InputData {COMPARISONS}; + + private String weightingScheme; + + private final VIntWritable DELIM = new VIntWritable(Integer.MIN_VALUE); + + private int cleanBlocks; + private int dirtyBlocks; + + Text iWij = new Text(); + Text jWij = new Text(); + + public void configure (JobConf job) { + weightingScheme = job.get("weightingScheme"); //one of ARCS,CBS,ECBS,JS,EJS + cleanBlocks = job.getInt("cleanBlocks", 0); + dirtyBlocks = job.getInt("dirtyBlocks", 0); + } + + + /** + * input: an extended blocking collection + * @param key block id + * @param value arrays of entity ids in this block (first element), along with the block ids (sorted) that contain them (remaining elements) + * e.g. [1,7,8,9][3,1,8,10] means that in this block belong the entities 1 and 3 and entity 1 is placed in blocks 7,8,9 (sorted) and + * entity 3 is placed in blocks 1,8,10 + * @param output key: entity id (each of the input values). 
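+ * (in practice, this mapper emits key = entity id and value = a "neighborId,weight" pair for every comparison with positive weight, once under each of the two entity ids)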
value: entity ids separated by " " (neighbors of output key) + */ + public void map(VIntWritable key, VIntArrayWritable value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + VIntWritable[] inputArray = value.get(); + + int noOfEntities = 0; + int lastIndex = 0; + List entityIndices = new ArrayList<>(); + int i; + for (i = 0; i < inputArray.length; ++i){ + if (inputArray[i].equals(DELIM)) { + VIntWritable[] tmpEntityIndex = new VIntWritable[i-lastIndex]; + noOfEntities++; + System.arraycopy(inputArray, lastIndex, tmpEntityIndex, 0, i-lastIndex); + entityIndices.add(tmpEntityIndex); + lastIndex = i+1; //the index of the first element (entity Id) of the new array + } + } + int counter = 0; + //dirty ER +// if (noOfEntities == 0) { +// reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); +// return; +// } + + //do the same for the last entity Index + VIntWritable[] lastEntityIndex = new VIntWritable[i-lastIndex]; + noOfEntities++; + System.arraycopy(inputArray, lastIndex, lastEntityIndex, 0, i-lastIndex); + entityIndices.add(lastEntityIndex); + + + VIntWritable[] entityIds = new VIntWritable[noOfEntities]; + int[][] entityBlocks = new int[noOfEntities][]; + for (VIntWritable[] tmpEntityIndex : entityIndices) { + //if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} GINETAI NA BGALOUME AUTO TON ELEGXO? + entityIds[counter] = tmpEntityIndex[0]; + + int noOfBlocks = tmpEntityIndex.length-1; + entityBlocks[counter] = new int[noOfBlocks]; + for (i=0; i < noOfBlocks; ++i) { + entityBlocks[counter][i] = tmpEntityIndex[i+1].get(); + } + counter++; + } + + DecimalFormat df = new DecimalFormat("#.###"); //format doubles to keep only first 4 decimal points (saves space) + + //clean-clean ER + /*List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + for (int entity : entityIndex.keySet()) { + if (entity < 0) { + D2entities.add(entity); + } else { + D1entities.add(entity); + } + } + if (D1entities.isEmpty() || D2entities.isEmpty()) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + //clean-clean ER + /*int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + int D1size = D1entities.size(); + //TODO: add formatting, to skip many decimal digits in weight string + + for (int e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1size); + blockse1 = entityIndex.get(e1); + for (int e2 : D2entities) { + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId, weightingScheme)) { + Double weight = MBTools.getWeight(blockse1, blockse2, blockId, weightingScheme, cleanBlocks, 0); + Double weightToEmit = Double.parseDouble(df.format(weight)); + ei.set(e1); + jWij.set(e2+","+weightToEmit); + output.collect(ei, jWij); + + ej.set(e2); + iWij.set(e1+","+weightToEmit); + output.collect(ej, iWij); + } + } + }*/ + + //dirty ER + int blockId = key.get(); + + reporter.incrCounter(InputData.COMPARISONS, (noOfEntities * (noOfEntities-1))/2); + for (i = 0; i < noOfEntities-1; ++i) { + VIntWritable ei = entityIds[i]; + reporter.setStatus(i+1+"/"+noOfEntities); + for (int j = i+1; j < noOfEntities; ++j) { + double weight = MBTools.getWeight(blockId, entityBlocks[i], entityBlocks[j], weightingScheme, dirtyBlocks); + if (weight > 0) { + VIntWritable ej = entityIds[j]; + Double weightToEmit = Double.parseDouble(df.format(weight)); + jWij.set(ej+","+weightToEmit); + output.collect(ei, jWij); + + iWij.set(ei+","+weightToEmit); + output.collect(ej, 
iWij); + } + } + } + } + + +} diff --git a/MetaBlocking/src/main/java/advanced/PCNPDriver.java b/MetaBlocking/src/main/java/advanced/PCNPDriver.java new file mode 100644 index 0000000..c5d637a --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/PCNPDriver.java @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.BufferedReader; +import java.io.InputStreamReader; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +public class PCNPDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(advanced.PCNPDriver.class); + + conf.setJobName("PCNP from Extended Input"); //used for PCNP + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(Text.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + conf.set("weightingScheme", args[0]); //one of: CBS, ECBS, JS, EJS, ARCS + FileInputFormat.setInputPaths(conf, new Path(args[1])); //ExtendedInput + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //PCNP + + conf.setMapperClass(advanced.PNPMapper.class); + conf.setReducerClass(blockingGraphPruning.CNP.class); + + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + conf.setNumReduceTasks(336); + + //use the following for CNP and CEPTotalOrder + try{ + Path pt=new Path("/user/hduser/BCin.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(pt))); + Float BCin = Float.parseFloat(br.readLine()); + br.close(); + conf.setFloat("BCin", BCin); + }catch(Exception e){ + System.err.println(e.toString()); + } + + conf.setCompressMapOutput(true); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/advanced/PNPMapper.java b/MetaBlocking/src/main/java/advanced/PNPMapper.java new file mode 100644 index 0000000..90ac552 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/PNPMapper.java @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class PNPMapper extends MapReduceBase implements Mapper { + + public enum Weight 
{WEIGHT_COUNTER}; + public enum OutputData {PURGED_BLOCKS}; + + private String weightingScheme; + + private VIntWritable ei = new VIntWritable(); + Text jWij = new Text(); + + public void configure (JobConf job) { + weightingScheme = job.get("weightingScheme"); //one of ARCS,CBS,ECBS,JS,EJS + } + + + /** + * input: a blocking collection + * @param key block id + * @param value arrays of entity ids in this block (first element), along with the block ids (sorted) that contain them (remaining elements) + * e.g. [1,7,8,9][3,1,8,10] means that in this block belong the entities 1 and 3 and entity 1 is placed in blocks 7,8,9 (sorted) and + * entity 3 is placed in blocks 1,8,10 + * @param output key: entity id (each of the input values). value: entity ids separated by " " (neighbors of output key) + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + Map> entityIndex = new HashMap<>(); //key is entity id, value is the list of blocks that contain the key + List blocks; + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array with the first element the entity and the rest elements its blocks + for (String tmpEntityIndex : entityIndices) { + if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} + tmpEntityIndex = tmpEntityIndex.substring(1); //to remove the initial '[' + String[] idsArray = tmpEntityIndex.split(", "); + int entityId = Integer.parseInt(idsArray[0]); + blocks = new ArrayList<>(idsArray.length-1); //maybe initial capacity is not needed + for (int i=1; i < idsArray.length; ++i) { + blocks.add(Integer.parseInt(idsArray[i])); + } + entityIndex.put(entityId, blocks); + } + + //dirty ER + //List entities = new ArrayList<>(); + + //clean-clean ER + List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + for (int entity : entityIndex.keySet()) { + if (entity < 0) { + D2entities.add(entity); + } else { + D1entities.add(entity); + } + } + + if (D1entities.isEmpty() || D2entities.isEmpty()) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + + int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + int D1size = D1entities.size(); + + for (int e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1size); + blockse1 = entityIndex.get(e1); + for (int e2 : D2entities) { + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(blockse1, blockse2, blockId, weightingScheme, 0, 0); + ei.set(e1); + jWij.set(e2+","+weight); + output.collect(ei, jWij); + } + } + } + } + + +} diff --git a/MetaBlocking/src/main/java/advanced/PWNPDriver.java b/MetaBlocking/src/main/java/advanced/PWNPDriver.java new file mode 100644 index 0000000..cc3ee8b --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/PWNPDriver.java @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import 
org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +public class PWNPDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(advanced.PWNPDriver.class); + + conf.setJobName("PWNP from Extended Input"); //used for PWNP + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(Text.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + conf.set("weightingScheme", args[0]); //one of: CBS, ECBS, JS, EJS, ARCS + FileInputFormat.setInputPaths(conf, new Path(args[1])); //ExtendedInput + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //PWNP + + conf.setMapperClass(advanced.PNPMapper.class); + conf.setReducerClass(blockingGraphPruning.WNP.class); + + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + conf.setNumReduceTasks(336); + + conf.setCompressMapOutput(true); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/advanced/WEPDriver.java b/MetaBlocking/src/main/java/advanced/WEPDriver.java new file mode 100644 index 0000000..f973f8d --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/WEPDriver.java @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.BufferedReader; +import java.io.InputStreamReader; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +public class WEPDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(advanced.WEPDriver.class); + + conf.setJobName("WEP from Extended Input"); //used for WEP + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + try{ + Path pt=new Path("/user/hduser/averageWeight.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(pt))); + String weight = br.readLine(); + br.close(); + conf.set("averageWeight", weight); //written from AverageWeight job + }catch(Exception e){ + System.err.println(e.toString()); + } + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //AverageWeight output + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //WEP + + conf.setMapperClass(advanced.WEPMapperOnly.class); + // conf.setReducerClass(advanced.WEPReducer.class); + + conf.set("mapred.max.tracker.failures", "100"); //before it gets black-listed + 
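+ // (note: mapred.max.tracker.failures is the number of task failures allowed on a single TaskTracker for this job before that tracker is blacklisted,
+ // and mapred.job.tracker.handler.count below sets the number of JobTracker server threads — Hadoop 1.x property names, as used throughout this patch)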
conf.set("mapred.job.tracker.handler.count", "40"); + + conf.setNumReduceTasks(0); + + //conf.setCompressMapOutput(true); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/advanced/WEPMapper.java b/MetaBlocking/src/main/java/advanced/WEPMapper.java new file mode 100644 index 0000000..0a69123 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/WEPMapper.java @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class WEPMapper extends MapReduceBase implements Mapper { + + public enum Weight {WEIGHT_COUNTER}; + public enum OutputData {PURGED_BLOCKS}; + + private String weightingScheme; + + private Text keyToEmit = new Text(); + private DoubleWritable valueToEmit = new DoubleWritable(); + + public void configure (JobConf job) { + weightingScheme = job.get("weightingScheme"); //one of ARCS,CBS,ECBS,JS,EJS + } + + + /** + * input: a blocking collection + * @param key block id + * @param value arrays of entity ids in this block (first element), along with the block ids (sorted) that contain them (remaining elements) + * e.g. [1,7,8,9][3,1,8,10] means that in this block belong the entities 1 and 3 and entity 1 is placed in blocks 7,8,9 (sorted) and + * entity 3 is placed in blocks 1,8,10 + * @param output key: entity id (each of the input values). 
value: entity ids separated by " " (neighbors of output key) + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + Map> entityIndex = new HashMap<>(); //key is entity id, value is the list of blocks that contain the key + List blocks; + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array with the first element the entity and the rest elements its blocks + for (String tmpEntityIndex : entityIndices) { + if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} + tmpEntityIndex = tmpEntityIndex.substring(1); //to remove the initial '[' + String[] idsArray = tmpEntityIndex.split(", "); + int entityId = Integer.parseInt(idsArray[0]); + blocks = new ArrayList<>(idsArray.length-1); //maybe initial capacity is not needed + for (int i=1; i < idsArray.length; ++i) { + blocks.add(Integer.parseInt(idsArray[i])); + } + entityIndex.put(entityId, blocks); + } + + //dirty ER + //List entities = new ArrayList<>(); + + //clean-clean ER + List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + for (int entity : entityIndex.keySet()) { + if (entity < 0) { + D2entities.add(entity); + } else { + D1entities.add(entity); + } + } + + if (D1entities.isEmpty() || D2entities.isEmpty()) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + + int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + int D1size = D1entities.size(); + + for (int e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1size); + blockse1 = entityIndex.get(e1); + for (int e2 : D2entities) { + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Double weight = MBTools.getWeight(blockse1, blockse2, blockId, weightingScheme, 0, 0); + reporter.incrCounter(Weight.WEIGHT_COUNTER, new Double(weight*1000).longValue()); + keyToEmit.set(e1+","+e2); + valueToEmit.set(weight); + output.collect(keyToEmit, valueToEmit); + } + } + } + } + + +} diff --git a/MetaBlocking/src/main/java/advanced/WEPMapperOnly.java b/MetaBlocking/src/main/java/advanced/WEPMapperOnly.java new file mode 100644 index 0000000..187f5e9 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/WEPMapperOnly.java @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.IOException; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class WEPMapperOnly extends MapReduceBase implements Mapper { + + private double averageWeight; + + public enum OutputData {OUTPUT_RECORDS}; + + + public void configure (JobConf job) { + averageWeight = Double.parseDouble(job.get("averageWeight")); + } + + public void map(Text key, DoubleWritable value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + if (value.get() > averageWeight) { + reporter.incrCounter(OutputData.OUTPUT_RECORDS, 1); +// output.collect(key, value); + } + } + +} diff --git a/MetaBlocking/src/main/java/advanced/WEPReducer.java b/MetaBlocking/src/main/java/advanced/WEPReducer.java new file mode 100644 index 0000000..73a1f42 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/WEPReducer.java @@ -0,0 
+1,52 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.IOException; +import java.util.Iterator; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; + +import advanced.WEPMapper.Weight; + + +public class WEPReducer extends MapReduceBase implements Reducer { + + double averageWeight; + public void configure(JobConf conf) { + long totalPairs; + long totalWeight; + try { + JobClient client = new JobClient(conf); + RunningJob parentJob = client.getJob(JobID.forName(conf.get("mapred.job.id"))); + totalPairs = parentJob.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", + "MAP_OUTPUT_RECORDS").getCounter(); + totalWeight = parentJob.getCounters().getCounter(Weight.WEIGHT_COUNTER) / 1000; + averageWeight = totalWeight / (double) totalPairs; + } catch (IOException e) { + e.printStackTrace(); + } + } + + public void reduce(Text _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + + DoubleWritable weight = values.next(); //only one value + if (weight.get() > averageWeight) { + output.collect(_key, weight); + } + + + } + +} diff --git a/MetaBlocking/src/main/java/advanced/WNPDriver.java b/MetaBlocking/src/main/java/advanced/WNPDriver.java new file mode 100644 index 0000000..a4fca18 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/WNPDriver.java @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +public class WNPDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(advanced.WNPDriver.class); + + conf.setJobName("WNP from Extended Input"); //used for WNP + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(Text.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + conf.set("weightingScheme", args[0]); //one of: CBS, ECBS, JS, ARCS + FileInputFormat.setInputPaths(conf, new Path(args[1])); //ExtendedInput + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //WNP + + conf.setMapperClass(advanced.NPMapperNew.class); + conf.setReducerClass(blockingGraphPruning.WNP.class); + + 
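+ // example invocation (the jar name is illustrative, not part of this patch):
+ // hadoop jar MetaBlocking.jar advanced.WNPDriver <weightingScheme: CBS|ECBS|JS|ARCS> <extendedInputPath> <WNPOutputPath>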
conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setInt("mapred.task.timeout", 10000000); + + conf.setNumReduceTasks(448); + + conf.setCompressMapOutput(true); + + BufferedReader br2 = null, br3 = null; + try{ + Path cleanPath=new Path("/user/hduser/numBlocksClean.txt"); + Path dirtyPath=new Path("/user/hduser/numBlocksDirty.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br2=new BufferedReader(new InputStreamReader(fs.open(cleanPath))); + Integer cleanBlocks = Integer.parseInt(br2.readLine()); + conf.setInt("cleanBlocks", cleanBlocks); + br3=new BufferedReader(new InputStreamReader(fs.open(dirtyPath))); + Integer dirtyBlocks = Integer.parseInt(br3.readLine()); + conf.setInt("dirtyBlocks", dirtyBlocks); + }catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br2.close();br3.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/advanced/WNPEJSDriver.java b/MetaBlocking/src/main/java/advanced/WNPEJSDriver.java new file mode 100644 index 0000000..ed80e72 --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/WNPEJSDriver.java @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +public class WNPEJSDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(advanced.WNPEJSDriver.class); + + conf.setJobName("WNP from Extended Input EJS"); //used for WNP + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(Text.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //EJSFinal (extended input for EJS) + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //WNP + + conf.setMapperClass(advanced.NPMapperEJS.class); + conf.setReducerClass(blockingGraphPruning.WNP.class); + + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setInt("mapred.task.timeout", 8000000); + conf.setNumReduceTasks(560); + + BufferedReader br = null; + try{ + Path pt= new Path("/user/hduser/validComparisons.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + String validComparisons = br.readLine(); + conf.set("validComparisons", validComparisons); + }catch(Exception e){ + System.err.println(e.toString()); + } finally { + 
try { br.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + conf.setCompressMapOutput(true); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/advanced/WNPMapper.java b/MetaBlocking/src/main/java/advanced/WNPMapper.java new file mode 100644 index 0000000..e85824b --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/WNPMapper.java @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class WNPMapper extends MapReduceBase implements Mapper { + + public enum Weight {WEIGHT_COUNTER}; + public enum OutputData {PURGED_BLOCKS}; + + private String weightingScheme; + + private VIntWritable ei = new VIntWritable(); + private VIntWritable ej = new VIntWritable(); + DoubleWritable weight = new DoubleWritable(); + + public void configure (JobConf job) { + weightingScheme = job.get("weightingScheme"); //one of ARCS,CBS,ECBS,JS,EJS + } + + + /** + * input: a blocking collection + * @param key block id + * @param value arrays of entity ids in this block (first element), along with the block ids (sorted) that contain them (remaining elements) + * e.g. [1,7,8,9][3,1,8,10] means that in this block belong the entities 1 and 3 and entity 1 is placed in blocks 7,8,9 (sorted) and + * entity 3 is placed in blocks 1,8,10 + * @param output key: entity id (each of the input values). 
value: entity ids separated by " " (neighbors of output key) + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + Map> entityIndex = new HashMap<>(); //key is entity id, value is the list of blocks that contain the key + List blocks; + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array with the first element the entity and the rest elements its blocks + for (String tmpEntityIndex : entityIndices) { + if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} + tmpEntityIndex = tmpEntityIndex.substring(1); //to remove the initial '[' + String[] idsArray = tmpEntityIndex.split(", "); + int entityId = Integer.parseInt(idsArray[0]); + blocks = new ArrayList<>(idsArray.length-1); //maybe initial capacity is not needed + for (int i=1; i < idsArray.length; ++i) { + blocks.add(Integer.parseInt(idsArray[i])); + } + entityIndex.put(entityId, blocks); + } + + //dirty ER + //List entities = new ArrayList<>(); + + //clean-clean ER + List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + for (int entity : entityIndex.keySet()) { + if (entity < 0) { + D2entities.add(entity); + } else { + D1entities.add(entity); + } + } + + if (D1entities.isEmpty() || D2entities.isEmpty()) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + + int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + int D1size = D1entities.size(); + + for (int e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1size); + blockse1 = entityIndex.get(e1); + for (int e2 : D2entities) { + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + weight.set(MBTools.getWeight(blockse1, blockse2, blockId, weightingScheme, 0, 0)); + ei.set(e1); + output.collect(ei, weight); + ej.set(e2); + output.collect(ej, weight); + } + } + } + } + + +} diff --git a/MetaBlocking/src/main/java/advanced/WNPReducer.java b/MetaBlocking/src/main/java/advanced/WNPReducer.java new file mode 100644 index 0000000..a55eb7f --- /dev/null +++ b/MetaBlocking/src/main/java/advanced/WNPReducer.java @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package advanced; + +import java.io.IOException; +import java.util.Iterator; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class WNPReducer extends MapReduceBase implements Reducer { + + DoubleWritable averageWeight = new DoubleWritable(); + + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + double totalWeight = 0; + int valuesCounter = 0; + while (values.hasNext()) { + totalWeight += values.next().get(); + valuesCounter++; + } + averageWeight.set(totalWeight / valuesCounter); + output.collect(_key, averageWeight); + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/ARCS.java b/MetaBlocking/src/main/java/blockingGraphBuilding/ARCS.java new file mode 100644 index 0000000..832d874 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/ARCS.java @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import java.io.IOException; +import java.util.Iterator; +import 
org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.io.VLongWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class ARCS extends MapReduceBase implements Reducer { + + /** + * @param _key i,j entity ids + * @param values list of ||bk||, where bk in Bij (the cardinalities of all common blocks) + * @param output key: same as input key. value: wij (ARCS) = sum_k(1/||bk||) + */ + public void reduce(Text _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + double sum = 0; + while (values.hasNext()) { + sum += 1.0 / values.next().get(); + } + output.collect(_key, new DoubleWritable(sum)); + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/ARCSDriver.java b/MetaBlocking/src/main/java/blockingGraphBuilding/ARCSDriver.java new file mode 100644 index 0000000..8861ead --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/ARCSDriver.java @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.io.VLongWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +public class ARCSDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphBuilding.ARCSDriver.class); + + conf.setJobName("ARCS intermediate"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntWritable.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(VLongWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + //conf.setOutputFormat(TextOutputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //input path in HDFS (Entity Index) + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //Blocking Graph ARCS (intermediate) + + conf.setMapperClass(blockingGraphBuilding.ARCSMapper.class); + conf.setReducerClass(blockingGraphBuilding.ARCSReducerDirty.class); + //conf.setReducerClass(blockingGraphBuilding.ARCSReducer.class); + + conf.setCompressMapOutput(true); + + conf.setNumReduceTasks(560); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "30"); + conf.set("mapred.max.tracker.failures", "200"); + conf.set("mapred.job.tracker.handler.count", "40"); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/ARCSMapper.java b/MetaBlocking/src/main/java/blockingGraphBuilding/ARCSMapper.java new file mode 
100644 index 0000000..fdd9c2a --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/ARCSMapper.java @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + + +import java.io.IOException; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.io.VLongWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + + +public class ARCSMapper extends MapReduceBase implements Mapper { + + static enum InputData {NOT_AN_ENTITY, NULL_PREFIX_ID, MALFORMED_PAIRS}; + + /** + * maps an input entity index into (key, value) pair(s) + * the value is the entity id (input key) along with the num of blocks that contain it + * the key each time is a block id (each element of the input value array) + * @param key an entity id + * @param value an array of block ids that this entity belongs to + * @param output key: a block id (each element of the input value array) - value: the entity id (input key) + */ + public void map(VIntWritable key, VIntArrayWritable value, + OutputCollector output, Reporter reporter) throws IOException { + + VIntWritable [] Bi = value.get(); + for (VIntWritable bi : Bi) { + output.collect(bi, key); + } + + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/ARCSReducer.java b/MetaBlocking/src/main/java/blockingGraphBuilding/ARCSReducer.java new file mode 100644 index 0000000..52e2df6 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/ARCSReducer.java @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.io.VLongWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + + +public class ARCSReducer extends MapReduceBase implements Reducer { + + static enum OutputData {PURGED_BLOCKS}; + + VLongWritable bk = new VLongWritable(); + Text comparison = new Text(); + + /** + * @param _key block id + * @param values a list of entity ids that belong to this block + * @param output key: i,j (entity ids) value: ||bk|| (block utility) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + //List entities = new ArrayList<>(); //dirty ER + + reporter.setStatus("reducing "+_key); + + while (values.hasNext()) { + //entities.add(values.next().get()); //dirty ER + int entity = values.next().get(); + if (entity >= 0) { + D1entities.add(entity); + } else { + D2entities.add(entity); + } + reporter.progress(); + } +// int blockSize = entities.size(); //dirty ER +// long numComparisons = ((long)blockSize * (blockSize-1)) / 2; //dirty ER + + long numComparisons = (long) D1entities.size() * (long) D2entities.size(); //clean-clean ER + + if (numComparisons == 0) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + bk.set(numComparisons); + + //clean-clean ER (comparisons) + for (int e1 : D1entities) { + for (int e2 : D2entities) { + comparison.set(e1+","+e2); + 
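+                // editor's note: each pair of this block is emitted with the block's
+                // comparison count ||bk|| as its value; the ARCS reducer of the next
+                // job then sums 1/||bk|| over all blocks shared by the pair,
+                // i.e. wij = sum_k(1/||bk||).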
output.collect(comparison, bk); + //output.collect(new Text(d1+"###"+d2), new VIntWritable(numComparisons)); //(for ARCS) + } + } + + + //dirty ER (comparisons) + //List prevEntities = new ArrayList<>(); + //for (String entity : entities) { + // for (String prevEntity : prevEntities) { + // ... + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/ARCSReducerDirty.java b/MetaBlocking/src/main/java/blockingGraphBuilding/ARCSReducerDirty.java new file mode 100644 index 0000000..0eff224 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/ARCSReducerDirty.java @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.io.VLongWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + + +public class ARCSReducerDirty extends MapReduceBase implements Reducer { + + VLongWritable bk = new VLongWritable(); + Text comparison = new Text(); + + static enum OutputData {PURGED_BLOCKS}; + + /** + * @param _key block id + * @param values a list of entity ids that belong to this block + * @param output key: i,j (entity ids) value: ||bk|| (block utility) + */ + @SuppressWarnings("unchecked") + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + List entities = new ArrayList<>(); //dirty ER + + reporter.setStatus("reducing "+_key); + + while (values.hasNext()) { + entities.add(values.next()); //dirty ER + } + + long numEntities = entities.size(); + + long numComparisons = (long)((long)numEntities * (long)(numEntities-1)) / 2; //dirty ER (for ARCS) + + if (numComparisons == 0) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + bk.set(numComparisons); + + Collections.sort(entities); + + //dirty ER (comparisons) + for (int i = 0; i < numEntities-1; ++i) { + reporter.setStatus(i+"/"+numEntities); + int e1 = entities.get(i).get(); + for (int j=i+1; j < numEntities; ++j) { + int e2 = entities.get(j).get(); + comparison.set(e1+","+e2); + output.collect(comparison, bk); + } + } + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsDriver.java b/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsDriver.java new file mode 100644 index 0000000..d14df64 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsDriver.java @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; + +import org.apache.commons.collections.comparators.ReverseComparator; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import 
org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +import blockingGraphBuilding.AllBlockComparisonsReducer.OutputData; + + +public class AllBlockComparisonsDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphBuilding.AllBlockComparisonsDriver.class); + + conf.setJobName("AllBlockComparisons"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(Text.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(VIntWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + //conf.setOutputFormat(TextOutputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //input path in HDFS (Entity Index) + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //output path in HDFS (blocks) + + conf.setMapperClass(blockingGraphBuilding.AllBlockComparisonsMapper.class); + conf.setReducerClass(blockingGraphBuilding.AllBlockComparisonsReducer.class); + //conf.setReducerClass(blockingGraphBuilding.AllBlockComparisonsReducerDirty.class); + + + conf.setNumReduceTasks(360); + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + if (job == null) { + System.err.println("No job found"); + return; + } + + //the following is used only for ECBS but does not create any overhead (keep it always) + try { + Counters counters = job.getCounters(); + long ReduceInputGroups = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", + "REDUCE_INPUT_GROUPS").getCounter(); + long purgedBlocks = counters.findCounter(OutputData.PURGED_BLOCKS).getCounter(); + Long numBlocks = ReduceInputGroups - purgedBlocks; + Path pt=new Path("/user/hduser/numBlocks.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedWriter br=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))); + br.write(numBlocks.toString()); + br.close(); + } catch (IllegalArgumentException | IOException e) { + System.err.println(e.toString()); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsDriverBalanced.java b/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsDriverBalanced.java new file mode 100644 index 0000000..178d217 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsDriverBalanced.java @@ -0,0 +1,94 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; + +import org.apache.commons.collections.comparators.ReverseComparator; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; 
+import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +import blockingGraphBuilding.AllBlockComparisonsReducer.OutputData; + + +public class AllBlockComparisonsDriverBalanced extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphBuilding.AllBlockComparisonsDriverBalanced.class); + + conf.setJobName("AllBlockComparisons Balanced (Dirty)"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(Text.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(VIntWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + //conf.setOutputFormat(TextOutputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //input path in HDFS (Entity Index) + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //output path in HDFS (blocks) + + conf.setMapperClass(blockingGraphBuilding.AllBlockComparisonsMapper.class); + //conf.setReducerClass(blockingGraphBuilding.AllBlockComparisonsReducer.class); + conf.setReducerClass(blockingGraphBuilding.AllBlockComparisonsReducerDirty.class); + + conf.setPartitionerClass(blockingGraphBuilding.AllBlockComparisonsParitioner.class); + + conf.setNumReduceTasks(360); + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + if (job == null) { + System.err.println("No job found"); + return; + } + + //the following is used only for ECBS but does not create any overhead (keep it always) + try { + Counters counters = job.getCounters(); + long ReduceInputGroups = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", + "REDUCE_INPUT_GROUPS").getCounter(); + long purgedBlocks = counters.findCounter(OutputData.PURGED_BLOCKS).getCounter(); + Long numBlocks = ReduceInputGroups - purgedBlocks; + Path pt=new Path("/user/hduser/numBlocks.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedWriter br=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))); + br.write(numBlocks.toString()); + br.close(); + } catch (IllegalArgumentException | IOException e) { + System.err.println(e.toString()); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsDriverBalancedAdvanced.java b/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsDriverBalancedAdvanced.java new file mode 100644 index 0000000..89ec72c --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsDriverBalancedAdvanced.java @@ -0,0 +1,179 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import hadoopUtils.Partition; +import hadoopUtils.PartitionComparator; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.net.URI; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.PriorityQueue; +import java.util.Queue; +import java.util.TreeMap; + +import 
org.apache.commons.collections.comparators.ReverseComparator; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +import blockingGraphBuilding.AllBlockComparisonsReducer.OutputData; + + +public class AllBlockComparisonsDriverBalancedAdvanced extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphBuilding.AllBlockComparisonsDriverBalancedAdvanced.class); + + conf.setJobName("AllBlockComparisons Balanced (Dirty)"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(Text.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(VIntWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + //conf.setOutputFormat(TextOutputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //input path in HDFS (Entity Index) + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //output path in HDFS (blocks) + + conf.setMapperClass(blockingGraphBuilding.AllBlockComparisonsMapper.class); + //conf.setReducerClass(blockingGraphBuilding.AllBlockComparisonsReducer.class); + conf.setReducerClass(blockingGraphBuilding.AllBlockComparisonsReducerDirty.class); + + + + + //a block is a map entry with key: blockId, value: #comparisons + Map blocks = new LinkedHashMap<>(); //keeps order of insertion (blocks are already sorted descending) + + try{ + Path pt=new Path("/user/hduser/afterFilteringBlockSizes.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(pt))); + + String line; + while ((line = br.readLine()) != null) { + String[] block = line.split("\t"); + int blockId = Integer.parseInt(block[0]); + long blockComparisons = Long.parseLong(block[1]); + blockComparisons = (blockComparisons * (blockComparisons-1) ) / 2; //dirty comparisons + blocks.put(blockId, blockComparisons); + } + br.close(); + }catch(Exception e){ + System.err.println(e.toString()); + } + + + //one partition for the largest block + Map.Entry largestBlock = blocks.entrySet().iterator().next(); + blocks.remove(largestBlock.getKey()); + Partition seedPartition = new Partition(); + seedPartition.addBlock(largestBlock); + + //maximum comparisons per partition + final long partitionComparisons = largestBlock.getValue(); + System.out.println("Partition comparisons\t:\t" + partitionComparisons); + + Queue pq = new PriorityQueue<>(blocks.size(), new PartitionComparator()); + pq.add(seedPartition); + + //Map blockPartitions = new HashMap<>(); //key: blockId, value:partition + + + for (Map.Entry block : blocks.entrySet()) { + Partition 
smallestPartition = pq.poll(); + long totalComparisons = smallestPartition.getTotalComparisons()+block.getValue(); + if (totalComparisons < partitionComparisons) { //if the new block fits into the smallest partition + smallestPartition.addBlock(block); //add it to the partition + } else { //otherwise create a new partition for the current block + Partition newPartition = new Partition(); + newPartition.addBlock(block); + pq.add(newPartition); + } + pq.add(smallestPartition); + } + + int noOfPartitions = pq.size(); + System.out.println("Total partitions\t:\t" + noOfPartitions); + + + + try{ + Path pt2=new Path("/user/hduser/blockPartitions.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedWriter bw=new BufferedWriter(new OutputStreamWriter(fs.create(pt2,true))); + + + + //store partitions from largest to smallest + for (int i = noOfPartitions-1; i >= 0; --i) { + Partition partition = pq.poll(); + + String paritionId = Integer.toString(i); + + for (Integer blockId : partition.getBlockIds()) { //write the mapping to a file, that will later be added to the DistributedCache + bw.write(Integer.toString(blockId)); + bw.write("\t"); + bw.write(paritionId); + bw.newLine(); + } + + } + bw.close(); + DistributedCache.addCacheFile(new URI(pt2.toString()), conf); + } catch(Exception e){ + System.err.println(e.toString()); + } + + + + conf.setNumReduceTasks(noOfPartitions); + conf.setPartitionerClass(blockingGraphBuilding.AllBlockComparisonsParitioner.class); + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + if (job == null) { + System.err.println("No job found"); + return; + } + } + + + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsMapper.java b/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsMapper.java new file mode 100644 index 0000000..490a92d --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsMapper.java @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + + +import java.io.IOException; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + + +public class AllBlockComparisonsMapper extends MapReduceBase implements Mapper { + + static enum InputData {NOT_AN_ENTITY, NULL_PREFIX_ID, MALFORMED_PAIRS}; + + Text outputValue = new Text(); + /** + * maps an input entity index into (key, value) pair(s) + * the value is the entity id (input key) along with the num of blocks that contain it + * the key each time is a block id (each element of the input value array) + * @param key an entity id + * @param value an array of block ids that this entity belongs to + * @param output key: a block id (each element of the input value array) - value: the entity id (input key), its entity index size (|Bi|) + * along with the num of blocks that contain it + */ + public void map(VIntWritable key, VIntArrayWritable value, + OutputCollector output, Reporter reporter) throws IOException { + + VIntWritable [] Bi = value.get(); + int BiSize = Bi.length; + for (VIntWritable bi : Bi) { + outputValue.set(key+","+BiSize); + output.collect(bi, outputValue); + } + + } + +} diff --git 
a/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsParitioner.java b/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsParitioner.java new file mode 100644 index 0000000..8113f7e --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsParitioner.java @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Partitioner; + +public class AllBlockComparisonsParitioner implements Partitioner{ + + //private final int MAX_BLOCK_ID = 1499534; //the number of lines in BlockSizes/part-00000 file + + Map blockPartitions; + Path[] localFiles; + + @Override + public void configure(JobConf job) { + blockPartitions = new HashMap<>(); + + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(job); //for the cluster version + SW = new BufferedReader(new FileReader(localFiles[0].toString())); //for the cluster version + String line; + while ((line = SW.readLine()) != null) { + if (line.trim().isEmpty()) {break;} + String[] block = line.split("\t"); + blockPartitions.put(Integer.parseInt(block[0]), Integer.parseInt(block[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + } + + //option 4 + @Override + public int getPartition(VIntWritable key, Text value, int numPartitions) { + int blockId = key.get(); + return blockPartitions.get(blockId); + } + +// //option 3 +// @Override +// public int getPartition(VIntWritable key, Text value, int numPartitions) { +// int blockId = key.get(); +// int inverseBlockId = MAX_BLOCK_ID - blockId; //the largest block is the one with MAX_BLOCK_ID +// if (inverseBlockId / numPartitions == 0) {// η πρώτη N-άδα +// return inverseBlockId % numPartitions; +// }else { +// return numPartitions-1-inverseBlockId%numPartitions; +// } +// } + + //option 2 +// @Override +// public int getPartition(VIntWritable key, Text value, int numPartitions) { +// int blockId = key.get(); +// int inverseBlockId = MAX_BLOCK_ID - blockId; //the largest block is the one with MAX_BLOCK_ID +// if (inverseBlockId / numPartitions % 2 == 0) {// περιττή Ν-άδα +// return inverseBlockId % numPartitions; +// }else { +// return numPartitions-1-inverseBlockId%numPartitions; +// } +// } + + //option 1 +// @Override +// public int getPartition(VIntWritable key, Text value, int numPartitions) { +// int blockId = key.get(); +// return (MAX_BLOCK_ID - blockId) % numPartitions; +// } + + + + + + + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsReducer.java b/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsReducer.java new file mode 100644 index 0000000..6dfa242 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsReducer.java @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.hadoop.io.Text; 
+import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + + +public class AllBlockComparisonsReducer extends MapReduceBase implements Reducer { + + private final static VIntWritable one = new VIntWritable(1); + Text comparison = new Text(); + + static enum OutputData {PURGED_BLOCKS}; + + /** + * groups the "i,|Bi|" pairs (input values) of the block "_key" (input key)
+ emits one output record for each comparison i,j within this block + * output key: i,|Bi|,j,|Bj| + * output value: 1 (for the ARCS variant: ||bk||, the number of comparisons in this block) + * @param _key block id + * @param values a list of "i,|Bi|" pairs, where i is an entity id and |Bi| is the number of blocks that i is placed in
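+ * (editor's illustrative example: the values "5,3" and "-7,2", i.e. entity 5 appearing in 3 blocks
+ * and entity -7 in 2, yield a single output record with key "5,3,-7,2" and value 1)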
+ * @param output key: i,|Bi|,j,|Bj| value: 1 (for ARCS: value = ||bk||) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + //List entities = new ArrayList<>(); //dirty ER + + reporter.setStatus("reducing "+_key); + + while (values.hasNext()) { + //entities.add(values.next().toString()); //dirty ER + String entity = values.next().toString(); + Integer entityId = Integer.parseInt(entity.substring(0,entity.indexOf(","))); + if (entityId >= 0) { + D1entities.add(entity); + } else { + D2entities.add(entity); + } + reporter.progress(); + } +// int blockSize = entities.size(); //dirty ER (for ARCS) +// long numComparisons = (blockSize * (blockSize-1)) / 2; //dirty ER (for ARCS) + + long numComparisons = D1entities.size() * D2entities.size(); //clean-clean ER (for ARCS) + + if (numComparisons == 0) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + //clean-clean ER (comparisons) + for (String e1 : D1entities) { + for (String e2 : D2entities) { + comparison.set(e1+","+e2); + output.collect(comparison, one); + //output.collect(new Text(e1+"###"+e2), new VIntWritable(numComparisons)); //(for ARCS) + } + } + + + //dirty ER (comparisons) + //List prevEntities = new ArrayList<>(); + //for (String entity : entities) { + // for (String prevEntity : prevEntities) { + // ... + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsReducerDirty.java b/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsReducerDirty.java new file mode 100644 index 0000000..a4751e9 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/AllBlockComparisonsReducerDirty.java @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + + +public class AllBlockComparisonsReducerDirty extends MapReduceBase implements Reducer { + + private final static VIntWritable one = new VIntWritable(1); + Text comparison = new Text(); + + static enum OutputData {PURGED_BLOCKS}; + + /** + * groups the "i,|Bi|" pairs (input values) of the block "_key" (input key)
+ emits one output record for each comparison i,j within this block + * output key: i,|Bi|,j,|Bj| + * output value: 1 (for the ARCS variant: ||bk||, the number of comparisons in this block) + * @param _key block id + * @param values a list of "i,|Bi|" pairs, where i is an entity id and |Bi| is the number of blocks that i is placed in
+ * @param output key: i,|Bi|,j,|Bj| value: 1 (for ARCS: value = ||bk||) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + //List D1entities = new ArrayList<>(); + //List D2entities = new ArrayList<>(); + List entities = new ArrayList<>(); //dirty ER + + reporter.setStatus("reducing "+_key); + + while (values.hasNext()) { + entities.add(values.next().toString()); //dirty ER + } +// int blockSize = entities.size(); //dirty ER (for ARCS) + long numComparisons = (entities.size() * (entities.size()-1)) / 2; //dirty ER (for ARCS) + +// long numComparisons = D1entities.size() * D2entities.size(); //clean-clean ER (for ARCS) + + if (numComparisons == 0) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + //clean-clean ER (comparisons) + /*for (String e1 : D1entities) { + for (String e2 : D2entities) { + comparison.set(e1+","+e2); + output.collect(comparison, one); + } + }*/ + + + //dirty ER (comparisons) + for (String e1 : entities) { + int e1val = Integer.parseInt(e1.substring(0,e1.indexOf(","))); + for (String e2 : entities) { + if (Integer.parseInt(e2.substring(0,e2.indexOf(","))) <= e1val) {continue;} + comparison.set(e1+","+e2); + output.collect(comparison, one); + } + } + + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/BlockingGraph.java b/MetaBlocking/src/main/java/blockingGraphBuilding/BlockingGraph.java new file mode 100644 index 0000000..7f47b99 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/BlockingGraph.java @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import java.io.BufferedReader; +import java.io.InputStreamReader; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.hadoop.mapred.lib.IdentityMapper; + + +public class BlockingGraph { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphBuilding.BlockingGraph.class); + + conf.setJobName("Blocking Graph (JS)"); + + conf.setMapOutputKeyClass(Text.class); + conf.setMapOutputValueClass(VIntWritable.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + //conf.setOutputFormat(TextOutputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //AllBlockComparisons + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //Blocking Graph weighted + + conf.setMapperClass(IdentityMapper.class); + //conf.setReducerClass(blockingGraphBuilding.CBS.class); +// conf.setReducerClass(blockingGraphBuilding.ECBS.class); + conf.setReducerClass(blockingGraphBuilding.JS.class); + //conf.setReducerClass(blockingGraphBuilding.ARCS.class); //use its own Driver + + 
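+        // --- editor's note (illustrative, not part of the original patch) ---
+        // For a pair (i,j) sharing |Bij| blocks, the selectable reducers above compute:
+        //   CBS  : wij = |Bij|
+        //   JS   : wij = |Bij| / (|Bi| + |Bj| - |Bij|)
+        //   ECBS : wij = |Bij| * log10(B/|Bi|) * log10(B/|Bj|), with B = total number
+        //          of blocks (read from numBlocks.txt further below)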
conf.setCombinerClass(blockingGraphBuilding.SumCombiner.class); //works for CBS,JS,ECBS + + conf.setCompressMapOutput(true); + + conf.setNumReduceTasks(360); + + //use the following only for ECBS + try{ + Path pt=new Path("/user/hduser/numBlocks.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(pt))); + Long numBlocks = Long.parseLong(br.readLine()); + br.close(); + conf.setLong("numBlocks", numBlocks); + }catch(Exception e){ + System.err.println(e.toString()); + } + + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/BlockingGraphARCS.java b/MetaBlocking/src/main/java/blockingGraphBuilding/BlockingGraphARCS.java new file mode 100644 index 0000000..b111fec --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/BlockingGraphARCS.java @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VLongWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.hadoop.mapred.lib.IdentityMapper; + + +public class BlockingGraphARCS { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphBuilding.BlockingGraphARCS.class); + + conf.setJobName("Blocking Graph (ARCS)"); + + conf.setMapOutputKeyClass(Text.class); + conf.setMapOutputValueClass(VLongWritable.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + //conf.setOutputFormat(TextOutputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //ARCS intermediate results + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //Blocking Graph weighted (ARCS) + + conf.setMapperClass(IdentityMapper.class); + conf.setReducerClass(blockingGraphBuilding.ARCS.class); + + conf.setCompressMapOutput(true); + + conf.setNumReduceTasks(560); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "30"); + conf.set("mapred.max.tracker.failures", "200"); + conf.set("mapred.job.tracker.handler.count", "40"); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/BlockingGraphEJS.java b/MetaBlocking/src/main/java/blockingGraphBuilding/BlockingGraphEJS.java new file mode 100644 index 0000000..9064696 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/BlockingGraphEJS.java @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import org.apache.hadoop.fs.Path; +import 
org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.hadoop.mapred.lib.IdentityMapper; + + +public class BlockingGraphEJS { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphBuilding.BlockingGraphEJS.class); + + conf.setJobName("Blocking Graph (EJS)"); + + conf.setMapOutputKeyClass(Text.class); + conf.setMapOutputValueClass(Text.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + //conf.setOutputFormat(TextOutputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //EJSReducer intermediate results + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //Blocking Graph weighted (EJSReducer) + + conf.setMapperClass(IdentityMapper.class); + conf.setReducerClass(blockingGraphBuilding.EJS.class); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "10"); + conf.set("mapred.max.tracker.failures", "100"); + conf.set("mapred.job.tracker.handler.count", "40"); + + conf.setNumReduceTasks(1120); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/CBS.java b/MetaBlocking/src/main/java/blockingGraphBuilding/CBS.java new file mode 100644 index 0000000..d1afea9 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/CBS.java @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class CBS extends MapReduceBase implements Reducer { + + + public void reduce(Text _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + double sum = 0; + while (values.hasNext()) { + sum += values.next().get(); + } + String[] inputKey = _key.toString().split(","); + StringBuffer outputKey = new StringBuffer(inputKey[0]); + outputKey.append(","); + outputKey.append(inputKey[2]); + output.collect(new Text(outputKey.toString()), new DoubleWritable(sum)); + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/ECBS.java b/MetaBlocking/src/main/java/blockingGraphBuilding/ECBS.java new file mode 100644 index 0000000..5fb7d6f --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/ECBS.java @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import 
java.io.IOException; +import java.util.Iterator; + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + + +public class ECBS extends MapReduceBase implements Reducer { + + private long B; + public void configure (JobConf job) { + B = job.getLong("numBlocks", 1); + } + + public void reduce(Text _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + double sum = 0; + while (values.hasNext()) { + sum += values.next().get(); + } + + String[] inputKey = _key.toString().split(","); + StringBuffer outputKey = new StringBuffer(inputKey[0]); + outputKey.append(","); + outputKey.append(inputKey[2]); + + Integer Bi = Integer.parseInt(inputKey[1]); + Integer Bj = Integer.parseInt(inputKey[3]); + + double log1 = Math.log10(B/(double)Bi); + double log2 = Math.log10(B/(double)Bj); + + output.collect(new Text(outputKey.toString()), new DoubleWritable(sum*log1*log2)); + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/EJS.java b/MetaBlocking/src/main/java/blockingGraphBuilding/EJS.java new file mode 100644 index 0000000..7ed5684 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/EJS.java @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.Iterator; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; + +public class EJS extends MapReduceBase implements Reducer { + + DoubleWritable toEmit = new DoubleWritable(); + DecimalFormat df; + + long Eb = 0; + public void configure(JobConf conf) { + try { + JobClient client = new JobClient(conf); + RunningJob parentJob = client.getJob(JobID.forName(conf.get("mapred.job.id"))); + long mapperCounter = parentJob.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", + "MAP_OUTPUT_RECORDS").getCounter(); + Eb = mapperCounter / 2; //mapperCounter is always an even number + } catch (IOException e) { + e.printStackTrace(); + } + df = new DecimalFormat("#.###"); //format doubles to keep only first 3 decimal points (saves space) + } + + /** + * @param key i,j entity ids + * @param value JS.vi and JS.vj (two values always) + * @param output key: i,j value: wij (EJSReducer) + */ + public void reduce(Text _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + String[] value1 = values.next().toString().split(","); + String[] value2 = values.next().toString().split(","); + double JS = Double.parseDouble(value1[0]); + double vi = Double.parseDouble(value1[1]); //to save further casting (int->double) + double vj = Double.parseDouble(value2[1]); //to save further casting (int->double) + double outputValue = JS * Math.log10(Eb / vi) * Math.log10(Eb / vj); + reporter.progress(); + toEmit.set(Double.parseDouble(df.format(outputValue))); +// toEmit.set(outputValue); + 
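+        // editor's note: this implements wij = JS_ij * log10(Eb/|vi|) * log10(Eb/|vj|),
+        // where Eb, the number of edges in the blocking graph, is recovered from the
+        // parent job's MAP_OUTPUT_RECORDS counter divided by 2 (every edge was emitted
+        // twice by EJSMapper).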
output.collect(_key, toEmit); + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/EJSDriver.java b/MetaBlocking/src/main/java/blockingGraphBuilding/EJSDriver.java new file mode 100644 index 0000000..f743769 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/EJSDriver.java @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +public class EJSDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphBuilding.EJSDriver.class); + + conf.setJobName("EJSReducer Driver (intermediate results)"); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(Text.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + //conf.setOutputFormat(TextOutputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //AllBlockComparisons + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //EJSReducer Blocking Graph (intermediate) + + conf.setMapperClass(blockingGraphBuilding.EJSMapper.class); + conf.setReducerClass(blockingGraphBuilding.EJSReducer.class); + + conf.setCompressMapOutput(true); + + conf.setNumReduceTasks(1120); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "30"); + conf.set("mapred.max.tracker.failures", "200"); + conf.set("mapred.job.tracker.handler.count", "40"); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/EJSMapper.java b/MetaBlocking/src/main/java/blockingGraphBuilding/EJSMapper.java new file mode 100644 index 0000000..d81851b --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/EJSMapper.java @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import java.io.IOException; + +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class EJSMapper extends MapReduceBase implements Mapper { + + Text i_Bi = new Text(); + Text j_Bj = new Text(); + /** + * @param key i,|Bi|,j,|Bj| where i,j are enity ids and |Bi|,|Bj| the respective entity index sizes + * @param value 1 (ignore) + * @param output key: i,|Bi| value: j,|Bj| and also the reverse (key:j,|Bj| value: i,|Bi|) + */ + public void map(Text key, VIntWritable value, + OutputCollector output, Reporter reporter) throws IOException { + String[] keyString = key.toString().split(","); + + i_Bi.set(keyString[0]+","+keyString[1]); + j_Bj.set(keyString[2]+","+keyString[3]); + 
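+        // editor's note: the pair is emitted in both directions so that EJSReducer can
+        // group on each endpoint and count its distinct neighbors |vi| in a single pass.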
output.collect(i_Bi,j_Bj); + output.collect(j_Bj,i_Bi); + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/EJSReducer.java b/MetaBlocking/src/main/java/blockingGraphBuilding/EJSReducer.java new file mode 100644 index 0000000..2d05c4b --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/EJSReducer.java @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class EJSReducer extends MapReduceBase implements Reducer { + + Text comparison = new Text(); + Text JSvi = new Text(); + /** + * @param key i,|Bi| where i is an enity id and |Bi| its entity index size + * @param value list of j,|Bj| (non-distinct) + * @param output key: i,j value: JS,|vi| + */ + public void reduce(Text _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + Map neighbors = new HashMap<>(); //map value is |Bij|,i.e., #common blocks + + String[] keyString = _key.toString().split(","); + Long i = Long.parseLong(keyString[0]); + Integer Bi = Integer.parseInt(keyString[1]); + + while (values.hasNext()) { + String mapKey = values.next().toString(); + Integer mapValue = neighbors.get(mapKey); + if (mapValue == null) { //first time this neighbor appears + mapValue = 0; + } + neighbors.put(mapKey, ++mapValue); + } + + int vi = neighbors.size(); + for (Map.Entry neighbor : neighbors.entrySet()) { + String[] valueString = neighbor.getKey().split(","); + Long j = Long.parseLong(valueString[0]); + Integer Bj = Integer.parseInt(valueString[1]); + Integer Bij = neighbor.getValue(); + double JS = (double) Bij / (Bi+Bj-Bij); + JSvi.set(JS+","+vi); + if (i > j) { //to ensure that both vi and vj will go to the same reduce task in the next job + comparison.set(i+","+j); + } else { + comparison.set(j+","+i); + } + output.collect(comparison, JSvi); + } + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/JS.java b/MetaBlocking/src/main/java/blockingGraphBuilding/JS.java new file mode 100644 index 0000000..821e7a1 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/JS.java @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class JS extends MapReduceBase implements Reducer { + + + public void reduce(Text _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + double sum = 0; + while (values.hasNext()) { + sum += values.next().get(); + } + String[] inputKey = _key.toString().split(","); + StringBuffer outputKey = new StringBuffer(inputKey[0]); + outputKey.append(","); + outputKey.append(inputKey[2]); + + Integer Bi = Integer.parseInt(inputKey[1]); + Integer Bj = Integer.parseInt(inputKey[3]); + + output.collect(new 
Text(outputKey.toString()), new DoubleWritable(sum/(Bi+Bj-sum))); + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphBuilding/SumCombiner.java b/MetaBlocking/src/main/java/blockingGraphBuilding/SumCombiner.java new file mode 100644 index 0000000..2627fb9 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphBuilding/SumCombiner.java @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphBuilding; + +import java.io.IOException; +import java.util.Iterator; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class SumCombiner extends MapReduceBase implements Reducer { + + //works for CBS,ECBS,JS + public void reduce(Text _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + int sum = 0; + while (values.hasNext()) { + sum += values.next().get(); + } + output.collect(_key, new VIntWritable(sum)); + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/AverageWeightCombiner.java b/MetaBlocking/src/main/java/blockingGraphPruning/AverageWeightCombiner.java new file mode 100644 index 0000000..d2da264 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/AverageWeightCombiner.java @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + + +public class AverageWeightCombiner extends MapReduceBase implements Reducer { + + /** + * identity mapper - just keep a counter to sum up weights + * @param key i,j entity ids + * @param value wij the weight of this edge + * @param output identical to intput (identity mapper) + */ + public void reduce(Text key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + double totalWeight = 0; + while (values.hasNext()) { + totalWeight += values.next().get(); + } + output.collect(key, new DoubleWritable(totalWeight)); + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/AverageWeightDriver.java b/MetaBlocking/src/main/java/blockingGraphPruning/AverageWeightDriver.java new file mode 100644 index 0000000..646fdc9 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/AverageWeightDriver.java @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; + + +public class AverageWeightDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphPruning.AverageWeightDriver.class); + + conf.setJobName("Average 
Edge Weight"); //used for WEP + + conf.setMapOutputKeyClass(Text.class); + conf.setMapOutputValueClass(DoubleWritable.class); + + conf.setOutputKeyClass(DoubleWritable.class); + conf.setOutputValueClass(NullWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(TextOutputFormat.class); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //Blocking Graph + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //Average Edge Weight (one value only) + + conf.setMapperClass(blockingGraphPruning.AverageWeightMapper.class); + conf.setCombinerClass(blockingGraphPruning.AverageWeightCombiner.class); + conf.setReducerClass(blockingGraphPruning.AverageWeightReducer.class); + + conf.setNumReduceTasks(1); + + conf.setCompressMapOutput(true); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/AverageWeightMapper.java b/MetaBlocking/src/main/java/blockingGraphPruning/AverageWeightMapper.java new file mode 100644 index 0000000..a43d06d --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/AverageWeightMapper.java @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.IOException; + +import org.apache.hadoop.io.ByteWritable; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + + +public class AverageWeightMapper extends MapReduceBase implements Mapper { + + private Text commonKey = new Text("1"); + /* TODO: check the following alternatives for the common key + byte testByte = 1; + private ByteWritable commonKey = new ByteWritable(testByte); + private VIntWritable commonKey = new VIntWritable(1); + */ + + /** + * identity mapper - just keep a counter to sum up weights + * @param key i,j entity ids + * @param value wij the weight of this edge + * @param output identical to intput (identity mapper) + */ + public void map(Text key, DoubleWritable value, + OutputCollector output, Reporter reporter) throws IOException { + output.collect(commonKey, value); //common key for every + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/AverageWeightReducer.java b/MetaBlocking/src/main/java/blockingGraphPruning/AverageWeightReducer.java new file mode 100644 index 0000000..c5a31d2 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/AverageWeightReducer.java @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.IOException; +import java.util.Iterator; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; + +public class AverageWeightReducer extends MapReduceBase implements Reducer { + + long totalPairs; + public void configure(JobConf conf) { + try { + JobClient client = new JobClient(conf); + RunningJob parentJob = 
client.getJob(JobID.forName(conf.get("mapred.job.id"))); + totalPairs = parentJob.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", + "MAP_OUTPUT_RECORDS").getCounter(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + /** + * identity mapper - just keep a counter to sum up weights + * @param key i,j entity ids + * @param value wij the weight of this edge + * @param output identical to input (identity mapper) + */ + public void reduce(Text key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + double totalWeight = 0; + while (values.hasNext()) { + totalWeight += values.next().get(); + } + output.collect(new DoubleWritable(totalWeight/totalPairs), NullWritable.get()); + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/CEPCombiner.java b/MetaBlocking/src/main/java/blockingGraphPruning/CEPCombiner.java new file mode 100644 index 0000000..a8fec2c --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/CEPCombiner.java @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.IOException; +import java.util.Iterator; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class CEPCombiner extends MapReduceBase implements Reducer { + + VIntWritable toEmit = new VIntWritable(); + + //get keys (weights) in descending order + public void reduce(DoubleWritable key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + int sum = 0; + while (values.hasNext()) { + sum += values.next().get(); + } + toEmit.set(sum); + output.collect(key, toEmit); + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/CEPCountingDriver.java b/MetaBlocking/src/main/java/blockingGraphPruning/CEPCountingDriver.java new file mode 100644 index 0000000..0e8d807 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/CEPCountingDriver.java @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; + + +public class CEPCountingDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphPruning.CEPCountingDriver.class); + + conf.setJobName("CEP Counting (1rst job)"); + + conf.setMapOutputKeyClass(DoubleWritable.class); + conf.setMapOutputValueClass(VIntWritable.class); + + conf.setOutputKeyClass(DoubleWritable.class); + conf.setOutputValueClass(NullWritable.class); + + conf.setOutputKeyComparatorClass(hadoopUtils.DescendingDoubleComparator.class); //sort doubles in descending order + + conf.setInputFormat(SequenceFileInputFormat.class); + 
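+ //plain-text output suffices here: the single reducer emits only the minimum retained weight and the count of surplus edges tied at that weight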
conf.setOutputFormat(TextOutputFormat.class); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //Blocking Graph + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //minValue and extra (more than k) elements + + conf.setMapperClass(blockingGraphPruning.CEPMapper.class); + conf.setCombinerClass(blockingGraphPruning.CEPCombiner.class); + conf.setReducerClass(blockingGraphPruning.CEPReducer.class); + + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setNumReduceTasks(1); + + conf.setCompressMapOutput(true); + + try{ + Path pt=new Path("/user/hduser/CEPk.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(pt))); + Integer K = Integer.parseInt(br.readLine()); + br.close(); + conf.setInt("K", K); + System.out.println("K="+K); + }catch(Exception e){ + System.err.println(e.toString()); + } + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/CEPFinalDriver.java b/MetaBlocking/src/main/java/blockingGraphPruning/CEPFinalDriver.java new file mode 100644 index 0000000..01f9999 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/CEPFinalDriver.java @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; + + +public class CEPFinalDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphPruning.CEPFinalDriver.class); + + conf.setJobName("CEP Final"); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //Blocking Graph + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //CEP + + try{ + Path pt=new Path(args[1]+"/part-00000"); //CEPCounting + FileSystem fs = FileSystem.get(new Configuration()); + BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(pt))); + String minValue = br.readLine(); + Integer extraElements = ((Double)Double.parseDouble(br.readLine())).intValue(); + br.close(); + conf.set("min", minValue); + conf.setInt("extra", extraElements); + System.out.println("min="+minValue); + System.out.println("extra="+extraElements); + + if (extraElements > 0) { //use a reducer to skip the extra elements + + conf.setMapperClass(blockingGraphPruning.CEPFinalMapper.class); + conf.setReducerClass(blockingGraphPruning.CEPFinalReducer.class); + + 
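+ //the reducer discards the surplus edges tied at the minimum weight, so that exactly the top-k edges survive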
conf.setNumReduceTasks(56); + + conf.setMapOutputKeyClass(DoubleWritable.class); + conf.setMapOutputValueClass(Text.class); + } else { //don't use a reducer + conf.setMapperClass(blockingGraphPruning.CEPFinalMapperOnly.class); + conf.setNumReduceTasks(0); + } + + + } catch(Exception e){ + System.err.println(e.toString()); + } + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/CEPFinalMapper.java b/MetaBlocking/src/main/java/blockingGraphPruning/CEPFinalMapper.java new file mode 100644 index 0000000..ef8529e --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/CEPFinalMapper.java @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.IOException; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + + +public class CEPFinalMapper extends MapReduceBase implements Mapper { + + double minValue; + int extraElements; + public void configure(JobConf conf) { + minValue = Double.parseDouble(conf.get("min", "0")); + extraElements = conf.getInt("extra", 0); + } + + /** + * emit only edges that have value >= minValue (i.e. belong in top k edges) + */ + public void map(Text key, DoubleWritable value, + OutputCollector output, Reporter reporter) throws IOException { + double weight = value.get(); + + if (weight >= minValue) { //edge belongs in top k+extraElements + output.collect(value, key); + } + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/CEPFinalMapperOnly.java b/MetaBlocking/src/main/java/blockingGraphPruning/CEPFinalMapperOnly.java new file mode 100644 index 0000000..b8a44f3 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/CEPFinalMapperOnly.java @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.IOException; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + + +public class CEPFinalMapperOnly extends MapReduceBase implements Mapper { + + double minValue; + + public void configure(JobConf conf) { + minValue = Double.parseDouble(conf.get("min", "0")); + } + + /** + * emit only edges that have value >= minValue (i.e. 
belong in top k edges) + */ + public void map(Text key, DoubleWritable value, + OutputCollector output, Reporter reporter) throws IOException { + double weight = value.get(); + + if (weight >= minValue) { //edge belongs in top k + output.collect(key, value); + } + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/CEPFinalReducer.java b/MetaBlocking/src/main/java/blockingGraphPruning/CEPFinalReducer.java new file mode 100644 index 0000000..fd49a92 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/CEPFinalReducer.java @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.IOException; +import java.util.Iterator; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class CEPFinalReducer extends MapReduceBase implements Reducer { + + double minValue; + int extraElements; + public void configure(JobConf conf) { + minValue = Double.parseDouble(conf.get("min", "0")); + extraElements = conf.getInt("extra", 0); + } + + public void reduce(DoubleWritable key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + if (key.get() == minValue) { //edge in topk+extraElements => skip ExtraElements + int counter = 0; + while (values.hasNext()) { //skip extraElements + values.next(); + if (++counter == extraElements) { + break; + } + } + + } + + //output the rest of the edges + while (values.hasNext()) { + output.collect(values.next(), key); + } + + + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/CEPMapper.java b/MetaBlocking/src/main/java/blockingGraphPruning/CEPMapper.java new file mode 100644 index 0000000..20af5a7 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/CEPMapper.java @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.IOException; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + + +public class CEPMapper extends MapReduceBase implements Mapper { + + VIntWritable one = new VIntWritable(1); + + /** + * output for each input edge both its nodes as keys and the other node and weight as value + * @param key i,j entity ids + * @param value wij the weight of this edge + * @param output key:i value:j,wij and the inverse (key:j value:i,wij) + */ + public void map(Text key, DoubleWritable value, + OutputCollector output, Reporter reporter) throws IOException { + output.collect(value, one); + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/CEPReducer.java b/MetaBlocking/src/main/java/blockingGraphPruning/CEPReducer.java new file mode 100644 index 0000000..0cce6d0 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/CEPReducer.java @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.IOException; +import java.util.Iterator; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.VIntWritable; +import 
org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class CEPReducer extends MapReduceBase implements Reducer { + + private int k; + private int counter; + + public void configure (JobConf conf) { + k = conf.getInt("K", 1000000); + counter = 0; + } + + //get keys (weights) in descending order + public void reduce(DoubleWritable key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + if (counter < k) { + int numComparisonsWithThisWeight = 0; + while (values.hasNext()) { + numComparisonsWithThisWeight += values.next().get(); + } + + counter += numComparisonsWithThisWeight; + + if (counter >= k) { //entered only once + output.collect(key, NullWritable.get()); //the minimum value (edges with greater value are in top k) + output.collect(new DoubleWritable(counter-k), NullWritable.get()); //#additional elements (how many more than k) + } + } //else we don't care, we have found the top-K scores + + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/CEPTotalOrder.java b/MetaBlocking/src/main/java/blockingGraphPruning/CEPTotalOrder.java new file mode 100644 index 0000000..d167726 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/CEPTotalOrder.java @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.IOException; + +import org.apache.commons.collections.comparators.ReverseComparator; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; +import org.apache.hadoop.mapred.lib.InputSampler; +import org.apache.hadoop.mapred.lib.InverseMapper; +import org.apache.hadoop.mapred.lib.TotalOrderPartitioner; + + +public class CEPTotalOrder { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphPruning.CEPTotalOrder.class); + + conf.setJobName("CEPTotalOrder"); + + conf.setMapOutputKeyClass(DoubleWritable.class); + conf.setMapOutputValueClass(Text.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(TextOutputFormat.class); + //conf.setOutputFormat(SequenceFileOutputFormat.class); + //SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //Blocking Graph + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //CEPTotalOrder + + conf.setMapperClass(InverseMapper.class); + conf.setReducerClass(hadoopUtils.InverseReducer.class); + + TotalOrderPartitioner.setPartitionFile(conf, new Path("/user/hduser/_partitions")); + + int numReduceTasks = 161; + + InputSampler.Sampler sampler = + new InputSampler.RandomSampler(0.1, numReduceTasks, numReduceTasks - 1); + + try { + InputSampler.writePartitionFile(conf, sampler); + } catch (IOException e1) { + e1.printStackTrace(); + } + + conf.setPartitionerClass(TotalOrderPartitioner.class); + + conf.setNumReduceTasks(numReduceTasks); + + 
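+ //the sampled partition file lets TotalOrderPartitioner assign key ranges to reducers, so the concatenated reducer outputs are globally sorted by weight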
client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/CNP.java b/MetaBlocking/src/main/java/blockingGraphPruning/CNP.java new file mode 100644 index 0000000..167786d --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/CNP.java @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.IOException; +import java.util.Collections; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class CNP extends MapReduceBase implements Reducer { + + public enum Output {NUM_RECORDS}; + + private int k; //for topK + public void configure (JobConf job) { + float BCin = job.getFloat("BCin", 1.0f); + k = ((Double)Math.floor(BCin - 1)).intValue(); + } + + /** + * output for each input node its edges with weight in top k weights + * @param key i entity id + * @param value list of j,wij (entity id, weight of edge i-j) + * @param output key:i,j value:wij for wij in top k weights + */ + public void reduce(VIntWritable key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + //sort neighbors in descending order of weight (key=weigh, value=neighborID) + Map neighbors = new TreeMap<>(Collections.reverseOrder()); + while (values.hasNext()) { + String[] value = values.next().toString().split(","); + Double weight = Double.parseDouble(value[1]); + Integer neighbor = Integer.parseInt(value[0]); + neighbors.put(weight, neighbor); //make sure that it does not overwrite neighbors with the same weight (e.g. 
use multimap) + } + + //Emit top k edges (k nearest neighbors) + for (Map.Entry edge : neighbors.entrySet()) { + if (k-- == 0) { return; } + if (key.get() >= 0) { //to make sure they will go to the same reducer (reciprocal) + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save space + //output.collect(new Text(key+","+edge.getValue()), new DoubleWritable(edge.getKey())); + } else { + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save space + //output.collect(new Text(edge.getValue()+","+key), new DoubleWritable(edge.getKey())); + } + } + + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/CNPDriver.java b/MetaBlocking/src/main/java/blockingGraphPruning/CNPDriver.java new file mode 100644 index 0000000..1d40141 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/CNPDriver.java @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +public class CNPDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphPruning.CNPDriver.class); + + conf.setJobName("CNP"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(Text.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //Blocking Graph + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //CNP + + conf.setMapperClass(blockingGraphPruning.NPMapper.class); //common for WNP and CNP + conf.setReducerClass(blockingGraphPruning.CNP.class); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "10"); + conf.set("mapred.max.tracker.failures", "100"); + conf.set("mapred.job.tracker.handler.count", "40"); + + conf.setNumReduceTasks(1120); + + conf.setCompressMapOutput(true); + + //use the following for CNP and CEPTotalOrder + try{ + Path pt=new Path("/user/hduser/BCin.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(pt))); + Float BCin = Float.parseFloat(br.readLine()); + br.close(); + conf.setFloat("BCin", BCin); + }catch(Exception e){ + System.err.println(e.toString()); + } + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/NPMapper.java b/MetaBlocking/src/main/java/blockingGraphPruning/NPMapper.java new file mode 100644 index 0000000..dcc117c 
--- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/NPMapper.java @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.IOException; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +//common for WNP and CNP +public class NPMapper extends MapReduceBase implements Mapper { + + VIntWritable i = new VIntWritable(); + VIntWritable j = new VIntWritable(); + Text iWij = new Text(); + Text jWij = new Text(); + + /** + * output for each input edge both its nodes as keys and the other node and weight as value + * @param key i,j entity ids + * @param value wij the weight of this edge + * @param output key:i value:j,wij and the inverse (key:j value:i,wij) + */ + public void map(Text key, DoubleWritable value, + OutputCollector output, Reporter reporter) throws IOException { + String[] comparison = key.toString().split(","); + i.set(Integer.parseInt(comparison[0])); + j.set(Integer.parseInt(comparison[1])); + iWij.set(i+","+value); + jWij.set(j+","+value); + + output.collect(i, jWij); + output.collect(j, iWij); + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/PCNPDriver.java b/MetaBlocking/src/main/java/blockingGraphPruning/PCNPDriver.java new file mode 100644 index 0000000..26c67b3 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/PCNPDriver.java @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +public class PCNPDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphPruning.PCNPDriver.class); + + conf.setJobName("PCNP"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(Text.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //Blocking Graph + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //PCNP + + conf.setMapperClass(blockingGraphPruning.PNPMapper.class); //common for PWNP and PCNP + conf.setReducerClass(blockingGraphPruning.CNP.class); + + conf.setNumReduceTasks(160); + + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + conf.setCompressMapOutput(true); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/PNPMapper.java b/MetaBlocking/src/main/java/blockingGraphPruning/PNPMapper.java new file mode 100644 
index 0000000..291a70d --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/PNPMapper.java @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.IOException; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +//common for WNP and CNP +public class PNPMapper extends MapReduceBase implements Mapper { + + VIntWritable i = new VIntWritable(); + VIntWritable j = new VIntWritable(); + //Text iWij = new Text(); + Text jWij = new Text(); + + /** + * output for each input edge both its nodes as keys and the other node and weight as value + * @param key i,j entity ids + * @param value wij the weight of this edge + * @param output key:i value:j,wij and NOT the inverse (key:j value:i,wij) + */ + public void map(Text key, DoubleWritable value, + OutputCollector output, Reporter reporter) throws IOException { + String[] comparison = key.toString().split(","); + i.set(Integer.parseInt(comparison[0])); //assuming i comes from the seed partition + j.set(Integer.parseInt(comparison[1])); + //iWij.set(i+","+value); + jWij.set(j+","+value); + + output.collect(i, jWij); + //output.collect(j, iWij); + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/PWNPDriver.java b/MetaBlocking/src/main/java/blockingGraphPruning/PWNPDriver.java new file mode 100644 index 0000000..067d1c0 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/PWNPDriver.java @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +public class PWNPDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphPruning.PWNPDriver.class); + + conf.setJobName("PWNP"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(Text.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //Blocking Graph + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //PWNP + + conf.setMapperClass(blockingGraphPruning.PNPMapper.class); //common for PWNP and PCNP + conf.setReducerClass(blockingGraphPruning.WNP.class); + + conf.setNumReduceTasks(160); + + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + conf.setCompressMapOutput(true); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/Reciprocal.java 
b/MetaBlocking/src/main/java/blockingGraphPruning/Reciprocal.java new file mode 100644 index 0000000..b3ff648 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/Reciprocal.java @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.IOException; +import java.util.Iterator; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +//common for WNP and CNP +public class Reciprocal extends MapReduceBase implements Reducer { + + DoubleWritable weight = new DoubleWritable(); + + /** + * output from the input comparisons (with either one or two values) those with two values + * @param key i,j entity ids (comparison) + * @param value list of one or two wij (weight of edge i-j) + * @param output key:i,j (same as input key) value:wij for the keys that have two values + */ + public void reduce(Text key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + weight = values.next(); //the first value + if (values.hasNext()) { //the edge is reciprocal (two identical values) + output.collect(key, weight); + } + + + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/ReciprocalDriver.java b/MetaBlocking/src/main/java/blockingGraphPruning/ReciprocalDriver.java new file mode 100644 index 0000000..6609fd4 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/ReciprocalDriver.java @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.hadoop.mapred.lib.IdentityMapper; + + +public class ReciprocalDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphPruning.ReciprocalDriver.class); + + conf.setJobName("Reciprocal"); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //WNP or CNP + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //Reciprocal + + conf.setMapperClass(IdentityMapper.class); //common for WNP and CNP + conf.setReducerClass(blockingGraphPruning.Reciprocal.class); + + conf.setNumReduceTasks(160); + + conf.setCompressMapOutput(true); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/WEP.java b/MetaBlocking/src/main/java/blockingGraphPruning/WEP.java new file mode 100644 index 0000000..e6fa432 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/WEP.java @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2015 Vasilis 
Efthymiou + */ +package blockingGraphPruning; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; + +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + + +public class WEP extends MapReduceBase implements Mapper { + + double avgWeight; + private Path[] localFiles; + + public void configure (JobConf job) { + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(job); + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + avgWeight = Double.parseDouble(SW.readLine()); + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + } + + /** + * discard all edges with weight lower than the average global weight + * @param key i,j entity ids + * @param value wij the weight of this edge + * @param output identical to intput (identity mapper) for wij > avgWeight + */ + public void map(Text key, DoubleWritable value, + OutputCollector output, Reporter reporter) throws IOException { + if (value.get() > avgWeight) { + output.collect(key, value); + } + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/WEPDriver.java b/MetaBlocking/src/main/java/blockingGraphPruning/WEPDriver.java new file mode 100644 index 0000000..a51b895 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/WEPDriver.java @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.net.URI; +import java.net.URISyntaxException; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +public class WEPDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphPruning.WEPDriver.class); + + conf.setJobName("WEP"); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //Blocking Graph + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //WEP + + conf.setMapperClass(blockingGraphPruning.WEP.class); + + conf.setNumReduceTasks(0); //no reducer + + try { + DistributedCache.addCacheFile(new URI(args[1]+"/"+"part-00000"), conf); //average edge weight + } catch (URISyntaxException e1) { + System.err.println(e1.toString()); + } + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git 
a/MetaBlocking/src/main/java/blockingGraphPruning/WNP.java b/MetaBlocking/src/main/java/blockingGraphPruning/WNP.java new file mode 100644 index 0000000..fdbe3f4 --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/WNP.java @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + + +//common for WNP and CNP +public class WNP extends MapReduceBase implements Reducer { + + public enum Output {NUM_RECORDS}; + + Text comparison = new Text(); + DoubleWritable weight = new DoubleWritable(); + + /** + * output for each input node its edges with weight above a local threshold (avg threshold) + * @param key i entity id + * @param value list of j,wij (entity id, weight of edge i-j) + * @param output key:i,j value:wij for wij > thresh + */ + public void reduce(VIntWritable key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + double localWeights = 0; + double localThresh; + + Map neighbors = new HashMap<>(); + + //find local thresh + while (values.hasNext()) { + String[] value = values.next().toString().split(","); + Double weight = Double.parseDouble(value[1]); + neighbors.put(Integer.parseInt(value[0]), weight); + localWeights += weight; + } + localThresh = localWeights / neighbors.size(); //the average weight + + //start emitting pruned edges + for (Map.Entry neighbor : neighbors.entrySet()) { + if (neighbor.getValue() > localThresh) { + if (key.get() >= 0) { //to make sure they will go to the same reducer (reciprocal) + comparison.set(key+","+neighbor.getKey()); + } else { + comparison.set(neighbor.getKey()+","+key); + } + weight.set(neighbor.getValue()); + //output.collect(comparison, weight); + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save space + } + } + + } + +} diff --git a/MetaBlocking/src/main/java/blockingGraphPruning/WNPDriver.java b/MetaBlocking/src/main/java/blockingGraphPruning/WNPDriver.java new file mode 100644 index 0000000..f2261ba --- /dev/null +++ b/MetaBlocking/src/main/java/blockingGraphPruning/WNPDriver.java @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package blockingGraphPruning; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + +public class WNPDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(blockingGraphPruning.WNPDriver.class); + + conf.setJobName("WNP"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(Text.class); + + conf.setOutputKeyClass(Text.class); + conf.setOutputValueClass(DoubleWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + 
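+ //block-compressed SequenceFile output keeps the pruned WNP graph compact; ReciprocalDriver later consumes WNP or CNP output as its input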
conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //Blocking Graph + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //WNP + + conf.setMapperClass(blockingGraphPruning.NPMapper.class); //common for WNP and CNP + conf.setReducerClass(blockingGraphPruning.WNP.class); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "10"); + conf.set("mapred.max.tracker.failures", "100"); + conf.set("mapred.job.tracker.handler.count", "40"); + + conf.setNumReduceTasks(1120); + + conf.setCompressMapOutput(true); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriver.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriver.java new file mode 100644 index 0000000..dfd3329 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriver.java @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.EntityIndexDriver.class); + + conf.setJobName("Entity Based"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + //conf.setOutputFormat(TextOutputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //entity-based output + + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressed.class); + conf.setReducerClass(entityBased.EntityBasedReducer.class); +// conf.setCombinerClass(entityBased.EntityBasedReducer.class); + + + conf.setNumReduceTasks(224); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + + BufferedReader br = null; + try{ + Path numEntitiesPath=new Path("/user/hduser/numEntities.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(numEntitiesPath))); + Integer numEntities = Integer.parseInt(br.readLine()); + conf.setInt("numEntities", numEntities); + }catch(Exception e){ + System.err.println(e.toString()); + 
} finally { + try { br.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + + client.setConf(conf); + + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverAverageWeight.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverAverageWeight.java new file mode 100644 index 0000000..776100f --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverAverageWeight.java @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverAverageWeight { + + /** + * + * @param args can be 3 or 4 arguments:
+ * args[0] is the weighting scheme
+ * args[1] is the input (the blocking collection after block filtering)
+ * if the weighting scheme (args[0]) is "CBS" then:
+ * args[2] is the output path
+ * else
+ * args[2] is the blocks per entity file path and
+ * args[3] is the output path
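+ * The computed average edge weight is written to /user/hduser/averageWeight.txt.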
+ */ + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverAverageWeight.class); + + conf.setJobName("Entity Based Average Weight"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + conf.set("weightingScheme", args[0]); + FileInputFormat.setInputPaths(conf, new Path(args[1])); //blocking collection + + if (args[0].equals("CBS")) { + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //entity-based output + } else { + try { + DistributedCache.addCacheFile(new URI(args[2]), conf); //blocks per entity + } catch (URISyntaxException e1) { + System.err.println(e1.toString()); + } + FileOutputFormat.setOutputPath(conf, new Path(args[3])); //entity-based output + } + + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); //Dirty +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPClean.class); //Clean-Clean ER + conf.setReducerClass(entityBased.EntityBasedReducerAverageWeight.class); + + conf.setNumReduceTasks(224); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + + try { + Counters counters = job.getCounters(); + + double totalWeight = counters.findCounter(entityBased.EntityBasedReducerAverageWeight.Weight.WEIGHT_COUNTER).getCounter() / 1000.0; + long comparisons = counters.findCounter(entityBased.EntityBasedReducerAverageWeight.Weight.NUM_EDGES).getCounter(); + Double averageWeight = totalWeight / comparisons; + Path pt=new Path("/user/hduser/averageWeight.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedWriter bw=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))); + bw.write(averageWeight.toString()); + bw.close(); + } catch (IllegalArgumentException | IOException e) { + System.err.println(e.toString()); + } + + + + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverAverageWeightARCS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverAverageWeightARCS.java new file mode 100644 index 0000000..102cef1 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverAverageWeightARCS.java @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class 
EntityBasedDriverAverageWeightARCS { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverAverageWeightARCS.class); + + conf.setJobName("Entity Based Average Weight ARCS"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //entity-based output + + //Dirty ER +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); +// conf.setReducerClass(entityBased.EntityBasedReducerAverageWeightARCSDirty.class); + + //Clean-Clean ER + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPARCSClean.class); + conf.setReducerClass(entityBased.EntityBasedReducerAverageWeightARCSClean.class); + + conf.setNumReduceTasks(224); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + try { + Counters counters = job.getCounters(); + + //clean-clean ER + double totalWeight = counters.findCounter(entityBased.EntityBasedReducerAverageWeightARCSClean.Weight.WEIGHT_COUNTER).getCounter() / 1000.0; + long comparisons = counters.findCounter(entityBased.EntityBasedReducerAverageWeightARCSClean.Weight.NUM_EDGES).getCounter(); + //dirty ER +// double totalWeight = counters.findCounter(entityBased.EntityBasedReducerAverageWeightARCSDirty.Weight.WEIGHT_COUNTER).getCounter() / 1000.0; +// long comparisons = counters.findCounter(entityBased.EntityBasedReducerAverageWeightARCSDirty.Weight.NUM_EDGES).getCounter(); + + Double averageWeight = totalWeight / comparisons; + Path pt=new Path("/user/hduser/averageWeight.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedWriter bw=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))); + bw.write(averageWeight.toString()); + bw.close(); + } catch (IllegalArgumentException | IOException e) { + System.err.println(e.toString()); + } + + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverAverageWeightEJS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverAverageWeightEJS.java new file mode 100644 index 0000000..5bcbbce --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverAverageWeightEJS.java @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import 
org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverAverageWeightEJS { + + /** + * + * @param args should have 3 elements: + * args[0]: input blocking collection + * args[1]: blocks per entity (file to be stored in distributed cache) + * args[2]: output of EJS + * + * comparisons are stored in HDFS from the NodeDegree job + * comparisonsPerEntity are stored in HDFS from the NodeDegree job (after a getmerge and a copyFromLocal) in /user/hduser/nodeDegrees.txt + * BCin is stored in HDFS from the EntityIndex job + */ + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverAverageWeightEJS.class); + + conf.setJobName("Entity Based Average Weight EJS"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //entity-based output + + +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); //dirty + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPClean.class); //clean-clean + conf.setReducerClass(entityBased.EntityBasedReducerAverageWeightEJS.class); + + + conf.setNumReduceTasks(224); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + + BufferedReader br = null; + try{ + Path pt=new Path("/user/hduser/comparisons.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + Long comparisons = Long.parseLong(br.readLine()); + conf.setLong("comparisons", comparisons); + }catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close();} + catch (IOException e) {System.err.println(e.toString());} + } + + try { + DistributedCache.addCacheFile(new URI(args[1]), conf); //blocks per entity + DistributedCache.addCacheFile(new URI("/user/hduser/nodeDegrees.txt"), conf); //comparisons per entity + } catch (URISyntaxException e1) { + System.err.println(e1.toString()); + } + + client.setConf(conf); + RunningJob job = null; + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + try { + Counters counters = job.getCounters(); + + double totalWeight = counters.findCounter(entityBased.EntityBasedReducerAverageWeightEJS.Weight.WEIGHT_COUNTER).getCounter() / 1000.0; + long comparisons = counters.findCounter(entityBased.EntityBasedReducerAverageWeightEJS.Weight.NUM_EDGES).getCounter(); + Double averageWeight = totalWeight / comparisons; + Path pt=new Path("/user/hduser/averageWeight.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedWriter bw=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))); + bw.write(averageWeight.toString()); + bw.close(); + } catch (IllegalArgumentException | IOException e) { + System.err.println(e.toString()); + } + } + +} \ No newline at end of file diff --git 
a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP1.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP1.java new file mode 100644 index 0000000..87a9baa --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP1.java @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.net.URI; +import java.net.URISyntaxException; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverCEP1 { + + /** + * + * @param args can be 3 or 4 arguments:
+ * args[0] is the weighting scheme
+ * args[1] is the input (the blocking collection after block filtering)
+ * if the weighting scheme (args[0]) is "CBS" then:
+ * args[2] is the output path
+ * else
+ * args[2] is the blocks per entity file path and
+ * args[3] is the output path
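+ * For illustration, a possible launch command (the jar name and the bracketed paths are placeholders):
+ *   hadoop jar MetaBlocking.jar entityBased.EntityBasedDriverCEP1 CBS <blocking collection> <output>
+ *   hadoop jar MetaBlocking.jar entityBased.EntityBasedDriverCEP1 <other scheme> <blocking collection> <blocks per entity> <output>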
+ */ + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverCEP1.class); + + conf.setJobName("Entity Based CEP (Job 1)"); + + conf.setOutputKeyClass(DoubleWritable.class); + conf.setOutputValueClass(VIntWritable.class); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + conf.set("weightingScheme", args[0]); + FileInputFormat.setInputPaths(conf, new Path(args[1])); //blocking collection + + if (args[0].equals("CBS")) { + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //entity-based output + } else { + try { + DistributedCache.addCacheFile(new URI(args[2]), conf); //blocks per entity + } catch (URISyntaxException e1) { + System.err.println(e1.toString()); + } + FileOutputFormat.setOutputPath(conf, new Path(args[3])); //entity-based output + } + +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); //Dirty + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPClean.class); //Clean-Clean ER + conf.setReducerClass(entityBased.EntityBasedReducerCEP.class); + + conf.setNumReduceTasks(728); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "20"); + conf.set("mapred.max.tracker.failures", "200"); + conf.set("mapred.job.tracker.handler.count", "40"); + + + client.setConf(conf); + + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP1ARCS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP1ARCS.java new file mode 100644 index 0000000..c21c4b8 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP1ARCS.java @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.net.URI; +import java.net.URISyntaxException; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverCEP1ARCS { + + /** + * + * @param args + * args[0] is the input (the blocking collection after block filtering)
+ * args[1] is the output path
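+ * For illustration, a possible launch command (jar name and bracketed paths are placeholders):
+ *   hadoop jar MetaBlocking.jar entityBased.EntityBasedDriverCEP1ARCS <blocking collection> <output>
+ * Note that, as committed, the Dirty ER mapper/reducer pair is active and the Clean-Clean ER pair is commented out.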
+ */ + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverCEP1ARCS.class); + + conf.setJobName("Entity Based CEP (Job 1 ARCS)"); + + conf.setOutputKeyClass(DoubleWritable.class); + conf.setOutputValueClass(VIntWritable.class); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //entity-based output + + + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); //Dirty + conf.setReducerClass(entityBased.EntityBasedReducerCEPARCSDirty.class); //Dirty + +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPARCSClean.class); //Clean-Clean ER +// conf.setReducerClass(entityBased.EntityBasedReducerCEPARCSClean.class); //Clean-clean + + conf.setNumReduceTasks(728); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "20"); + conf.set("mapred.max.tracker.failures", "200"); + conf.set("mapred.job.tracker.handler.count", "40"); + + + client.setConf(conf); + + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP1EJS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP1EJS.java new file mode 100644 index 0000000..769cfa8 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP1EJS.java @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverCEP1EJS { + + /** + * + * @param args + * args[0] is the input (the blocking collection after block filtering)
+ * args[1] is the blocks per entity file path and
+ * args[2] is the output path
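+ * Besides these arguments, the job reads /user/hduser/comparisons.txt from HDFS and adds /user/hduser/nodeDegrees.txt to the distributed cache, so both must already exist.
+ * For illustration, a possible launch command (jar name and bracketed paths are placeholders):
+ *   hadoop jar MetaBlocking.jar entityBased.EntityBasedDriverCEP1EJS <blocking collection> <blocks per entity> <output>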
+ */ + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverCEP1EJS.class); + + conf.setJobName("Entity Based CEP (Job 1 EJS)"); + + conf.setOutputKeyClass(DoubleWritable.class); + conf.setOutputValueClass(VIntWritable.class); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + + + BufferedReader br = null; + try{ + Path pt=new Path("/user/hduser/comparisons.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + Long comparisons = Long.parseLong(br.readLine()); + conf.setLong("comparisons", comparisons); + }catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + + try { + DistributedCache.addCacheFile(new URI(args[1]), conf); //blocks per entity + DistributedCache.addCacheFile(new URI("/user/hduser/nodeDegrees.txt"), conf); //comparisons per entity + } catch (URISyntaxException e1) { + System.err.println(e1.toString()); + } + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //entity-based output + + + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); //Dirty +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPClean.class); //Clean-Clean ER + conf.setReducerClass(entityBased.EntityBasedReducerCEPEJS.class); + + conf.setNumReduceTasks(728); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "20"); + conf.set("mapred.max.tracker.failures", "200"); + conf.set("mapred.job.tracker.handler.count", "40"); + + + client.setConf(conf); + + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP2.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP2.java new file mode 100644 index 0000000..5d6ece5 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP2.java @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; +import org.apache.hadoop.mapred.lib.IdentityMapper; + + +public class EntityBasedDriverCEP2 extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + 
JobConf conf = new JobConf(advanced.CEPCountingDriver.class); + + conf.setJobName("Entity Based CEP (Job 2)"); + + conf.setMapOutputKeyClass(DoubleWritable.class); + conf.setMapOutputValueClass(VIntWritable.class); + + conf.setOutputKeyClass(DoubleWritable.class); + conf.setOutputValueClass(NullWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(TextOutputFormat.class); + + conf.setOutputKeyComparatorClass(hadoopUtils.DescendingDoubleComparator.class); //sort doubles in descending order + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //CEP1 + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //minValue and extra (more than k) elements + + conf.setMapperClass(IdentityMapper.class); + conf.setCombinerClass(blockingGraphPruning.CEPCombiner.class); + conf.setReducerClass(blockingGraphPruning.CEPReducer.class); + + conf.setNumReduceTasks(1); + + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setInt("mapred.task.timeout", 10000000); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "10"); + conf.set("mapred.max.tracker.failures", "100"); + conf.set("mapred.job.tracker.handler.count", "40"); + + conf.setCompressMapOutput(true); + + BufferedReader br = null, br2 = null, br3 = null; + try { + Path pt=new Path("/user/hduser/CEPk.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + Integer K = Integer.parseInt(br.readLine()); + br.close(); + conf.setInt("K", K); + System.out.println("K="+K); + + Path cleanPath=new Path("/user/hduser/numBlocksClean.txt"); + Path dirtyPath=new Path("/user/hduser/numBlocksDirty.txt"); + br2=new BufferedReader(new InputStreamReader(fs.open(cleanPath))); + Integer cleanBlocks = Integer.parseInt(br2.readLine()); + conf.setInt("cleanBlocks", cleanBlocks); + br3=new BufferedReader(new InputStreamReader(fs.open(dirtyPath))); + Integer dirtyBlocks = Integer.parseInt(br3.readLine()); + conf.setInt("dirtyBlocks", dirtyBlocks); + } catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close(); br2.close();br3.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP3.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP3.java new file mode 100644 index 0000000..81866d9 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP3.java @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import 
org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverCEP3 { + + /** + * + * @param args can be 4 or 5 arguments:
+ * args[0] is the weighting scheme
+ * args[1] is the input (the blocking collection after block filtering)
+ * args[2] is the output path of CEP2 (read here for the min value)
+ * if the weighting scheme (args[0]) is "CBS" then:
+ * args[3] is the output path
+ * else
+ * args[3] is the blocks per entity file path and
+ * args[4] is the output path
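+ * For illustration, a possible launch command for a scheme other than CBS (jar name, scheme, and bracketed paths are placeholders):
+ *   hadoop jar MetaBlocking.jar entityBased.EntityBasedDriverCEP3 <scheme> <blocking collection> <CEP2 output> <blocks per entity> <output>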
+ */ + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverCEP3.class); + + conf.setJobName("Entity Based CEP (Job 3)"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntArrayWritable.class); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + conf.set("weightingScheme", args[0]); + FileInputFormat.setInputPaths(conf, new Path(args[1])); //blocking collection + + if (args[0].equals("CBS")) { + FileOutputFormat.setOutputPath(conf, new Path(args[3])); //entity-based output + } else { + try { + DistributedCache.addCacheFile(new URI(args[3]), conf); //blocks per entity + } catch (URISyntaxException e1) { + System.err.println(e1.toString()); + } + FileOutputFormat.setOutputPath(conf, new Path(args[4])); //entity-based output + } + + + + BufferedReader br = null; + try{ + Path pt=new Path(args[2]+"/part-00000"); //CEP2 + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + String minValue = br.readLine(); + conf.set("min", minValue); + System.out.println("min="+minValue); + //ignore extra elements for now (do not read next line) + } catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + + + + +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); //Dirty + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPClean.class); //Clean-Clean ER + conf.setReducerClass(entityBased.EntityBasedReducerCEPFinal.class); + + conf.setNumReduceTasks(224); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "10"); + conf.set("mapred.max.tracker.failures", "100"); + conf.set("mapred.job.tracker.handler.count", "40"); + + + client.setConf(conf); + + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP3ARCS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP3ARCS.java new file mode 100644 index 0000000..2962add --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP3ARCS.java @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import 
org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverCEP3ARCS { + + /** + * + * @param args + * args[0] is the input (the blocking collection after block filtering)
+ * args[1] is the output path of CEP2 (read here for the min value) and
+ * args[2] is the output path
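+ * For illustration, a possible launch command (jar name and bracketed paths are placeholders):
+ *   hadoop jar MetaBlocking.jar entityBased.EntityBasedDriverCEP3ARCS <blocking collection> <CEP2 output> <output>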
+ */ + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverCEP3ARCS.class); + + conf.setJobName("Entity Based CEP (Job 3 ARCS)"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntArrayWritable.class); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //entity-based output + + + BufferedReader br = null; + try{ + Path pt=new Path(args[1]+"/part-00000"); //CEP2 + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + String minValue = br.readLine(); + conf.set("min", minValue); + System.out.println("min="+minValue); + //ignore extra elements for now (do not read next line) + } catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); //Dirty +// conf.setReducerClass(entityBased.EntityBasedReducerCEPFinalARCSDirty.class); + + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPARCSClean.class); //Clean-Clean ER + conf.setReducerClass(entityBased.EntityBasedReducerCEPFinalARCSClean.class); + + conf.setNumReduceTasks(224); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "10"); + conf.set("mapred.max.tracker.failures", "100"); + conf.set("mapred.job.tracker.handler.count", "40"); + + + client.setConf(conf); + + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP3EJS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP3EJS.java new file mode 100644 index 0000000..036384e --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCEP3EJS.java @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverCEP3EJS { + + /** + * + * @param args + * args[0] is the input (the blocking collection 
after block filtering)
+ * args[1] is the blocks per entity file path and
+ * args[2] is the output path of CEP2
+ * args[3] is the output path
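+ * The job also expects /user/hduser/nodeDegrees.txt to exist in HDFS (it is added to the distributed cache below).
+ * For illustration, a possible launch command (jar name and bracketed paths are placeholders):
+ *   hadoop jar MetaBlocking.jar entityBased.EntityBasedDriverCEP3EJS <blocking collection> <blocks per entity> <CEP2 output> <output>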
+ */ + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverCEP3EJS.class); + + conf.setJobName("Entity Based CEP (Job 3 EJS)"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntArrayWritable.class); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + + try { + DistributedCache.addCacheFile(new URI(args[1]), conf); //blocks per entity + DistributedCache.addCacheFile(new URI("/user/hduser/nodeDegrees.txt"), conf); //comparisons per entity + } catch (URISyntaxException e1) { + System.err.println(e1.toString()); + } + FileOutputFormat.setOutputPath(conf, new Path(args[3])); //entity-based output + + + BufferedReader br = null; + try{ + Path pt=new Path(args[2]+"/part-00000"); //CEP2 + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + String minValue = br.readLine(); + conf.set("min", minValue); + System.out.println("min="+minValue); + //ignore extra elements for now (do not read next line) + } catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); //Dirty + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPClean.class); //Clean-Clean ER + conf.setReducerClass(entityBased.EntityBasedReducerCEPFinalEJS.class); + + conf.setNumReduceTasks(224); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "10"); + conf.set("mapred.max.tracker.failures", "100"); + conf.set("mapred.job.tracker.handler.count", "40"); + + + client.setConf(conf); + + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCNP.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCNP.java new file mode 100644 index 0000000..fb015ff --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCNP.java @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; 
+ +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverCNP { + + /** + * + * @param args can be 3 or 4 arguments:
+ * args[0] is the weighting scheme
+ * args[1] is the input (the blocking collection after block filtering)
+ * if the weighting scheme (args[0]) is "CBS" then:
+ * args[2] is the output path
+ * else
+ * args[2] is the blocks per entity file path and
+ * args[3] is the output path
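+ * The job also reads the BCin value from /user/hduser/BCin.txt in HDFS, so that file must already exist.
+ * For illustration, a possible launch command for a scheme other than CBS (jar name and bracketed paths are placeholders):
+ *   hadoop jar MetaBlocking.jar entityBased.EntityBasedDriverCNP <scheme> <blocking collection> <blocks per entity> <output>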
+ */ + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverCNP.class); + + conf.setJobName("Entity Based CNP"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + conf.set("weightingScheme", args[0]); + FileInputFormat.setInputPaths(conf, new Path(args[1])); //blocking collection + + if (args[0].equals("CBS")) { + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //entity-based output + } else { + try { + DistributedCache.addCacheFile(new URI(args[2]), conf); //blocks per entity + } catch (URISyntaxException e1) { + System.err.println(e1.toString()); + } + FileOutputFormat.setOutputPath(conf, new Path(args[3])); //entity-based output + } + +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); //Dirty + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPClean.class); //Clean-Clean ER + conf.setReducerClass(entityBased.EntityBasedReducerCNP.class); + + conf.setNumReduceTasks(504); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + conf.set("io.sort.mb", "400"); + + + BufferedReader br = null; + try{ + Path pt=new Path("/user/hduser/BCin.txt"); +// Path cleanPath=new Path("/user/hduser/numBlocksClean.txt"); +// Path dirtyPath=new Path("/user/hduser/numBlocksDirty.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + Float BCin = Float.parseFloat(br.readLine()); + conf.setFloat("BCin", BCin); +// br2=new BufferedReader(new InputStreamReader(fs.open(cleanPath))); +// Integer cleanBlocks = Integer.parseInt(br2.readLine()); +// Path numEntitiesPath=new Path("/user/hduser/numEntities.txt"); +// FileSystem fs = FileSystem.get(new Configuration()); +// br=new BufferedReader(new InputStreamReader(fs.open(numEntitiesPath))); +// Integer numEntities = Integer.parseInt(br.readLine()); +// conf.setInt("numEntities", numEntities); + }catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + + client.setConf(conf); + + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCNPARCS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCNPARCS.java new file mode 100644 index 0000000..902f670 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCNPARCS.java @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; 
+import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverCNPARCS { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverCNPARCS.class); + + conf.setJobName("Entity Based CNP ARCS"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //entity-based output + + //Dirty ER +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); +// conf.setReducerClass(entityBased.EntityBasedReducerCNPARCSDirty.class); + + //Clean-Clean ER + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPARCSClean.class); + conf.setReducerClass(entityBased.EntityBasedReducerCNPARCSClean.class); + + conf.setNumReduceTasks(504); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + conf.set("io.sort.mb", "400"); + + + BufferedReader br = null; + try{ + Path pt=new Path("/user/hduser/BCin.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + Float BCin = Float.parseFloat(br.readLine()); + conf.setFloat("BCin", BCin); + }catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + + client.setConf(conf); + + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCNPEJS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCNPEJS.java new file mode 100644 index 0000000..f584888 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverCNPEJS.java @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverCNPEJS { + + /** + * + * @param args should have 3 elements: + * args[0]: input blocking collection + * args[1]: blocks per entity (file to be stored in distributed cache) + * args[2]: output of EJS + * + * comparisons are stored in HDFS from the NodeDegree job + * comparisonsPerEntity are stored in HDFS from the NodeDegree job (after a getmerge and a copyFromLocal) in 
/user/hduser/nodeDegrees.txt + * BCin is stored in HDFS from the EntityIndex job + */ + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverCNPEJS.class); + + conf.setJobName("Entity Based CNP EJS"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //entity-based output + + +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); //dirty + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPClean.class); //clean-clean + conf.setReducerClass(entityBased.EntityBasedReducerCNPEJS.class); + + + conf.setNumReduceTasks(504); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + conf.set("io.sort.mb", "400"); + + + BufferedReader br = null; + BufferedReader br2 = null; + try{ + Path pt=new Path("/user/hduser/comparisons.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + Long comparisons = Long.parseLong(br.readLine()); + conf.setLong("comparisons", comparisons); + + Path pt2=new Path("/user/hduser/BCin.txt"); + br2=new BufferedReader(new InputStreamReader(fs.open(pt2))); + Float BCin = Float.parseFloat(br2.readLine()); + conf.setFloat("BCin", BCin); + }catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close(); br2.close();} + catch (IOException e) {System.err.println(e.toString());} + } + + try { + DistributedCache.addCacheFile(new URI(args[1]), conf); //blocks per entity + DistributedCache.addCacheFile(new URI("/user/hduser/nodeDegrees.txt"), conf); //comparisons per entity + } catch (URISyntaxException e1) { + System.err.println(e1.toString()); + } + + client.setConf(conf); + + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWEP.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWEP.java new file mode 100644 index 0000000..64ae713 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWEP.java @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import 
org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverWEP { + + /** + * + * @param args can be 3 or 4 arguments:
+ * args[0] is the weighting scheme
+ * args[1] is the input (the blocking collection after block filtering)
+ * if the weighting scheme (args[0]) is "CBS" then:
+ * args[2] is the output path
+ * else
+ * args[2] is the blocks per entity file path and
+ * args[3] is the output path
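+ * The job also reads the average edge weight from /user/hduser/averageWeight.txt (written by the AverageWeight job), so that file must already exist.
+ * For illustration, a possible launch command for a scheme other than CBS (jar name and bracketed paths are placeholders):
+ *   hadoop jar MetaBlocking.jar entityBased.EntityBasedDriverWEP <scheme> <blocking collection> <blocks per entity> <output>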
+ */ + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverWEP.class); + + conf.setJobName("Entity Based WEP"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + conf.set("weightingScheme", args[0]); + FileInputFormat.setInputPaths(conf, new Path(args[1])); //blocking collection + + if (args[0].equals("CBS")) { + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //entity-based output + } else { + try { + DistributedCache.addCacheFile(new URI(args[2]), conf); //blocks per entity + } catch (URISyntaxException e1) { + System.err.println(e1.toString()); + } + FileOutputFormat.setOutputPath(conf, new Path(args[3])); //entity-based output + } + + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); //Dirty +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPClean.class); //Clean-Clean ER + conf.setReducerClass(entityBased.EntityBasedReducerWEP.class); + + conf.setNumReduceTasks(224); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + + try{ + Path pt=new Path("/user/hduser/averageWeight.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(pt))); + String weight = br.readLine(); + br.close(); + conf.set("averageWeight", weight); //written from AverageWeight job + }catch(Exception e){ + System.err.println(e.toString()); + } + + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWEPARCS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWEPARCS.java new file mode 100644 index 0000000..7c1cf21 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWEPARCS.java @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverWEPARCS { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverWEPARCS.class); + + conf.setJobName("Entity Based WEP ARCS"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + 
conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //entity-based output + + //Dirty ER +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); +// conf.setReducerClass(entityBased.EntityBasedReducerWEPARCSDirty.class); + + //Clean-Clean ER + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPARCSClean.class); + conf.setReducerClass(entityBased.EntityBasedReducerWEPARCSClean.class); + + conf.setNumReduceTasks(224); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + + try{ + Path pt=new Path("/user/hduser/averageWeight.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedReader br2=new BufferedReader(new InputStreamReader(fs.open(pt))); + String weight = br2.readLine(); + br2.close(); + conf.set("averageWeight", weight); //written from AverageWeight job + }catch(Exception e){ + System.err.println(e.toString()); + } + + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWEPEJS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWEPEJS.java new file mode 100644 index 0000000..b1bec89 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWEPEJS.java @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverWEPEJS { + + /** + * + * @param args should have 3 elements: + * args[0]: input blocking collection + * args[1]: blocks per entity (file to be stored in distributed cache) + * args[2]: output of EJS + * + * comparisons are stored in HDFS from the NodeDegree job + * comparisonsPerEntity are stored in HDFS from the NodeDegree job (after a getmerge and a copyFromLocal) in /user/hduser/nodeDegrees.txt + * BCin is stored in HDFS from the EntityIndex job + */ + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverWEPEJS.class); + + conf.setJobName("Entity Based WEP EJS"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + 
SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //entity-based output + + +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); //dirty + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPClean.class); //clean-clean + conf.setReducerClass(entityBased.EntityBasedReducerAverageWeightEJS.class); + + + conf.setNumReduceTasks(224); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + + BufferedReader br = null; + try{ + Path pt=new Path("/user/hduser/comparisons.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + Long comparisons = Long.parseLong(br.readLine()); + conf.setLong("comparisons", comparisons); + }catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close();} + catch (IOException e) {System.err.println(e.toString());} + } + + try{ + Path pt=new Path("/user/hduser/averageWeight.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedReader br2=new BufferedReader(new InputStreamReader(fs.open(pt))); + String weight = br2.readLine(); + br2.close(); + conf.set("averageWeight", weight); //written from AverageWeight job + }catch(Exception e){ + System.err.println(e.toString()); + } + + try { + DistributedCache.addCacheFile(new URI(args[1]), conf); //blocks per entity + DistributedCache.addCacheFile(new URI("/user/hduser/nodeDegrees.txt"), conf); //comparisons per entity + } catch (URISyntaxException e1) { + System.err.println(e1.toString()); + } + + client.setConf(conf); + + + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWNP.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWNP.java new file mode 100644 index 0000000..c44f022 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWNP.java @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverWNP { + + /** + * + * @param args can be 3 or 4 arguments:
+ * args[0] is the weighting scheme
+ * args[1] is the input (the blocking collection after block filtering)
+ * if the weighting scheme (args[0]) is "CBS" then:
+ * args[2] is the output path
+ * else
+ * args[2] is the blocks per entity file path and
+ * args[3] is the output path
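+ * For illustration, a possible launch command for a scheme other than CBS (jar name and bracketed paths are placeholders):
+ *   hadoop jar MetaBlocking.jar entityBased.EntityBasedDriverWNP <scheme> <blocking collection> <blocks per entity> <output>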
+ */ + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverWNP.class); + + conf.setJobName("Entity Based WNP"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + conf.set("weightingScheme", args[0]); + FileInputFormat.setInputPaths(conf, new Path(args[1])); //blocking collection + + if (args[0].equals("CBS")) { + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //entity-based output + } else { + try { + DistributedCache.addCacheFile(new URI(args[2]), conf); //blocks per entity + } catch (URISyntaxException e1) { + System.err.println(e1.toString()); + } + FileOutputFormat.setOutputPath(conf, new Path(args[3])); //entity-based output + } + +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); //Dirty + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPClean.class); //Clean-Clean ER + conf.setReducerClass(entityBased.EntityBasedReducerWNP.class); + + conf.setNumReduceTasks(224); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + client.setConf(conf); + + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWNPARCS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWNPARCS.java new file mode 100644 index 0000000..5eede5a --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWNPARCS.java @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverWNPARCS { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverWNPARCS.class); + + conf.setJobName("Entity Based WNP ARCS"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //entity-based output + + //Dirty ER +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); +// conf.setReducerClass(entityBased.EntityBasedReducerWNPARCSDirty.class); + + //Clean-Clean ER + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPARCSClean.class); + conf.setReducerClass(entityBased.EntityBasedReducerWNPARCSClean.class); + + conf.setNumReduceTasks(224); + + 
conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + + client.setConf(conf); + + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWNPEJS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWNPEJS.java new file mode 100644 index 0000000..dc3dbee --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedDriverWNPEJS.java @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + +public class EntityBasedDriverWNPEJS { + + /** + * + * @param args should have 3 elements: + * args[0]: input blocking collection + * args[1]: blocks per entity (file to be stored in distributed cache) + * args[2]: output of EJS + * + * comparisons are stored in HDFS from the NodeDegree job + * comparisonsPerEntity are stored in HDFS from the NodeDegree job (after a getmerge and a copyFromLocal) in /user/hduser/nodeDegrees.txt + */ + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.EntityBasedDriverWNPEJS.class); + + conf.setJobName("Entity Based WNP EJS"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //entity-based output + + + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); //dirty +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPClean.class); //clean-clean + conf.setReducerClass(entityBased.EntityBasedReducerWNPEJS.class); + + + conf.setNumReduceTasks(224); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + + BufferedReader br = null; + try{ + Path pt=new Path("/user/hduser/comparisons.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + br=new BufferedReader(new InputStreamReader(fs.open(pt))); + Long comparisons = Long.parseLong(br.readLine()); + conf.setLong("comparisons", comparisons); + }catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + try { + DistributedCache.addCacheFile(new URI(args[1]), conf); //blocks per entity + 
DistributedCache.addCacheFile(new URI("/user/hduser/nodeDegrees.txt"), conf); //comparisons per entity + } catch (URISyntaxException e1) { + System.err.println(e1.toString()); + } + + client.setConf(conf); + + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedIndexDriver.java b/MetaBlocking/src/main/java/entityBased/EntityBasedIndexDriver.java new file mode 100644 index 0000000..ee68093 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedIndexDriver.java @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.VIntArrayWritable; + + + +public class EntityBasedIndexDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.EntityIndexDriver.class); + + conf.setJobName("Entity Index (With Block Filtering)"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntWritable.class); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + //conf.setOutputFormat(TextOutputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //entity-based index + + conf.setMapperClass(entityBased.EntityBasedIndexMapper.class); + conf.setReducerClass(entityBased.EntityBasedIndexReducerMemory.class); + + conf.setNumReduceTasks(56); + + conf.setInt("mapred.task.timeout", 10000000); + + + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedIndexMapper.java b/MetaBlocking/src/main/java/entityBased/EntityBasedIndexMapper.java new file mode 100644 index 0000000..72f57f0 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedIndexMapper.java @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + +import java.io.IOException; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class EntityBasedIndexMapper extends MapReduceBase implements Mapper { + + VIntWritable entityId = new VIntWritable(); + /** + * input: a blocking collection + * input key: block id + * input value: entity ids in this block, separated by "#" + * output key: entity id (each of the input values) + * output value: block id (the same as the input key) + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) 
throws IOException { + +// String valueString = value.toString().replaceFirst(";", ""); //clean +// String []entities = valueString.split("#"); //clean + String []entities = value.toString().split("#"); //dirty +// VIntWritable[] entities = value.get(); + + for (String entity : entities) { + entityId.set(Integer.parseInt(entity)); +// if (entity == null) { continue; } + output.collect(entityId, key); + } + } + + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedIndexReducer.java b/MetaBlocking/src/main/java/entityBased/EntityBasedIndexReducer.java new file mode 100644 index 0000000..11c6870 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedIndexReducer.java @@ -0,0 +1,129 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedIndexReducer extends MapReduceBase implements Reducer { + + + static enum OutputData {D1Entities, D2Entities, BLOCK_ASSIGNMENTS, CACHE_HITS}; + + Map> blockCache; + Set entities; //the entities that belong to a common block with _key entity + + public void configure(JobConf conf) { + blockCache = new HashMap<>(); //caches the block contents of each reducer (local) + } + + /** + * + * @param _key entity id + * @param values block ids of the current entity + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + Set blocks = new HashSet<>(); //the blocks of the _key entity + entities = new TreeSet<>(); + + while (values.hasNext()) { + Integer block = values.next().get(); //the block id + blocks.add(block); + } + + addToCache(blocks, reporter); + + reporter.setStatus("Cached blocks"); + for (Integer block : blocks) { + entities.addAll(blockCache.get(block)); + } + + entities.remove(_key); + reporter.setStatus("Writing blocks"); + + VIntWritable[] tmpArray = new VIntWritable[entities.size()]; + tmpArray = entities.toArray(tmpArray); + VIntArrayWritable toEmit = new VIntArrayWritable(tmpArray); + + output.collect(_key, toEmit); + + + } + + + private void addToCache(Set blocks, Reporter reporter) { + + Set newBlocks = new HashSet<>(blocks); + newBlocks.removeAll(blockCache.keySet()); + reporter.incrCounter(OutputData.CACHE_HITS, blocks.size()-newBlocks.size()); + + if (newBlocks.isEmpty()) { //all blocks are cached + return; + } + + + List blockEntitiesList = new ArrayList<>(); //the entities of this block + + + + + //do a single scan in the input blocking collection + BufferedReader br=null; + try{ +// FileSystem fs = FileSystem.get(new Configuration()); +// Path inFile = new Path("/user/hduser/dbpediaDirtyRaw.txt"); +// br = new BufferedReader(new InputStreamReader(fs.open(inFile))); //OPTION 1: read from HDFS + br = new BufferedReader(new 
FileReader("/home/user/dbpediaDirtyRaw.txt")); //OPTION 2: read from local FS + String line; + while ((line = br.readLine()) != null) { + reporter.progress(); + String block[] = line.split("\t"); //first part is id, second part is contents (entity Ids) + int blockId = Integer.parseInt(block[0]); + if (newBlocks.contains(blockId)) { + reporter.setStatus("Adding the contents of block: "+blockId); + String[] blockEntities = block[1].split("#"); + for (String eId : blockEntities) { + if (eId != "") { + blockEntitiesList.add(new VIntWritable(Integer.parseInt(eId))); + } + } + blockCache.put(blockId, blockEntitiesList); +// entities.addAll(blockEntitiesList); + } +// blocks.remove(blockId); //to free some space + } + }catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close();} + catch (IOException e) {System.err.println(e.toString());} + } + } + + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedIndexReducerMemory.java b/MetaBlocking/src/main/java/entityBased/EntityBasedIndexReducerMemory.java new file mode 100644 index 0000000..68ea49d --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedIndexReducerMemory.java @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedIndexReducerMemory extends MapReduceBase implements Reducer { + + + static enum OutputData {D1Entities, D2Entities, BLOCK_ASSIGNMENTS, CACHE_HITS}; + + Map> blockCache; + Set entities; //the entities that belong to a common block with _key entity + + public void configure(JobConf conf) { + blockCache = new HashMap<>(); //caches the block contents of each reducer (local) + //do a single scan in the input blocking collection to load it in memory + BufferedReader br=null; + try{ +// FileSystem fs = FileSystem.get(new Configuration()); +// Path inFile = new Path("/user/hduser/dbpediaDirtyRaw.txt"); +// br = new BufferedReader(new InputStreamReader(fs.open(inFile))); //OPTION 1: read from HDFS + br = new BufferedReader(new FileReader("/home/user/dbpediaDirtyRaw.txt")); //OPTION 2: read from local FS + String line; + while ((line = br.readLine()) != null) { + List blockEntitiesList = new ArrayList<>(); //the entities of this block + String block[] = line.split("\t"); //first part is id, second part is contents (entity Ids) + int blockId = Integer.parseInt(block[0]); + String[] blockEntities = block[1].split("#"); + for (String eId : blockEntities) { + if (eId != "") { + blockEntitiesList.add(new VIntWritable(Integer.parseInt(eId))); + } + } + blockCache.put(blockId, blockEntitiesList); + } + }catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { br.close();} + catch (IOException e) {System.err.println(e.toString());} + } + } + + /** + * 
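+	 * For each input entity, emits the distinct ids of all other entities that share at least one block with it, using the block cache loaded once in configure().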
+ * @param _key entity id + * @param values block ids of the current entity + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + Set blocks = new HashSet<>(); //the blocks of the _key entity + entities = new TreeSet<>(); + + while (values.hasNext()) { + Integer block = values.next().get(); //the block id + blocks.add(block); + } + + for (Integer block : blocks) { + entities.addAll(blockCache.get(block)); + } + + entities.remove(_key); + reporter.setStatus("Writing blocks"); + + VIntWritable[] tmpArray = new VIntWritable[entities.size()]; + tmpArray = entities.toArray(tmpArray); + VIntArrayWritable toEmit = new VIntArrayWritable(tmpArray); + + output.collect(_key, toEmit); + + + } + + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedMapper.java b/MetaBlocking/src/main/java/entityBased/EntityBasedMapper.java new file mode 100644 index 0000000..9ed0ae8 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedMapper.java @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedMapper extends MapReduceBase implements Mapper { + + VIntArrayWritable toEmit = new VIntArrayWritable(); + /** + * input: a blocking collection + * input key: block id + * input value: entity ids in this block, separated by "#" + * output key: entity id (each of the input values) + * output value: an array with all other entities (ids) in this block + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + +// String valueString = value.toString().replaceFirst(";", ""); //clean +// String []entities = valueString.split("#"); //clean + String []entities = value.toString().split("#"); //dirty +// VIntWritable[] entities = value.get(); + +// VIntWritable[] array = new VIntWritable[entities.length]; + List entityList = new ArrayList<>(entities.length); + + for (int i = 0; i < entities.length; ++i) { + entityList.add(i, new VIntWritable(Integer.parseInt(entities[i]))); + } + + for (int i = 0; i < entities.length; ++i) { + reporter.setStatus((i+1)+"/"+entities.length); + List tmp = new ArrayList<>(entityList); + tmp.remove(i); //remove element at position i (not entity with id i) + + + VIntWritable[] tmpArray = new VIntWritable[tmp.size()]; + toEmit.set(tmp.toArray(tmpArray)); +// VIntArrayWritable toEmit = new VIntArrayWritable((VIntWritable[])tmp.toArray()); + output.collect(entityList.get(i), toEmit); + } + + + } + + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedMapperFromCompressed.java b/MetaBlocking/src/main/java/entityBased/EntityBasedMapperFromCompressed.java new file mode 100644 index 0000000..0cd237e --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedMapperFromCompressed.java @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + +import hadoopUtils.RelativePositionCompression; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import 
org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedMapperFromCompressed extends MapReduceBase implements Mapper { + + VIntArrayWritable toEmit = new VIntArrayWritable(); + /** + * input: a blocking collection + * input key: block id + * input value: entity ids in this block + * output key: entity id (each of the input values) + * output value: an array with all other entities (ids) in this block + */ + public void map(VIntWritable key, VIntArrayWritable value, + OutputCollector output, Reporter reporter) throws IOException { + + VIntWritable[] entities = value.get(); + + Arrays.sort(entities); //do it once, to save doing it in compression + + +// for (int i = 0; i < entities.length; ++i) { //when all entities are copied (OPTION 1) + for (int i = 0; i < entities.length-1; ++i) { //when bigger entities are copied (OPTION 2) + reporter.setStatus((i+1)+"/"+entities.length); + //OPTION 1: copy all elements before and after i +// VIntWritable[] tmpEntities = new VIntWritable[entities.length-1]; //all other entities, except the one at index i +// +// System.arraycopy(entities, 0, tmpEntities, 0, i); //all elements before i +// System.arraycopy(entities, i+1, tmpEntities, i, entities.length-i-1); //all elements after i + + //OPTION 2: copy only the elements after i + VIntWritable[] tmpEntities = new VIntWritable[entities.length-i-1]; //all BIGGER entities + System.arraycopy(entities, i+1, tmpEntities, 0, tmpEntities.length); //all elements after i + + output.collect(entities[i], RelativePositionCompression.compress(tmpEntities)); + } + + + } + + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedMapperFromCompressedNP.java b/MetaBlocking/src/main/java/entityBased/EntityBasedMapperFromCompressedNP.java new file mode 100644 index 0000000..50dd40d --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedMapperFromCompressedNP.java @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + +import hadoopUtils.RelativePositionCompression; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedMapperFromCompressedNP extends MapReduceBase implements Mapper { + + VIntArrayWritable toEmit = new VIntArrayWritable(); + /** + * input: a blocking collection + * @param key block id + * @param value entity ids in this block + * @param output key: entity id (each of the input values) + * value: an array with all other entities (ids) in this block + */ + public void map(VIntWritable key, VIntArrayWritable value, + OutputCollector output, Reporter reporter) throws IOException { + + VIntWritable[] entities = value.get(); + Arrays.sort(entities); + VIntArrayWritable outputEntities = RelativePositionCompression.compress(entities); + for (int i = 0; i < entities.length; ++i) { + reporter.setStatus((i+1)+"/"+entities.length); + output.collect(entities[i], outputEntities); + } + } +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedMapperFromCompressedNPARCSClean.java b/MetaBlocking/src/main/java/entityBased/EntityBasedMapperFromCompressedNPARCSClean.java new file mode 100644 index 0000000..4738066 --- /dev/null 
+++ b/MetaBlocking/src/main/java/entityBased/EntityBasedMapperFromCompressedNPARCSClean.java @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + +import hadoopUtils.RelativePositionCompression; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedMapperFromCompressedNPARCSClean extends MapReduceBase implements Mapper { + + VIntArrayWritable toEmit = new VIntArrayWritable(); + /** + * input: a blocking collection + * @param key block id + * @param value entity ids in this block + * @param output key: entity id (each of the input values) + * value: an array with all other entities (ids) in this block + */ + @SuppressWarnings("unchecked") + public void map(VIntWritable key, VIntArrayWritable value, + OutputCollector output, Reporter reporter) throws IOException { + + VIntWritable[] entities = value.get(); + + //separate positives from negatives + List positives = new ArrayList<>(); + List negatives = new ArrayList<>(); + + for (int i = 0; i < entities.length; ++i) { + if (entities[i].get() < 0) { + negatives.add(entities[i]); + } else { + positives.add(entities[i]); + } + } + + if (positives.isEmpty() || negatives.isEmpty()) { + return; //purged block (no comparisons) + } + + Collections.sort(positives); //sort positives in ascending order + Collections.sort(negatives, Collections.reverseOrder()); //sort negatives in descending order (saves more space in compression) + + //store the number of entities in the other dataset, as an extra element, placed first + //so that block cardinality = |positives|*|negatives| can be retrieved in the reducer + final int numPositives = positives.size(); + final int numNegatives = negatives.size(); + positives.add(0, new VIntWritable(numNegatives)); //add the #negatives as the first element of positives list + negatives.add(0, new VIntWritable(numPositives)); //add the #positives as the first element of negatives list + + //compress the two arrays once + VIntWritable[] positivesArray = new VIntWritable[positives.size()]; + VIntWritable[] negativesArray = new VIntWritable[negatives.size()]; + + VIntArrayWritable positiveEntities = RelativePositionCompression.compressFromSecond(positives.toArray(positivesArray)); + VIntArrayWritable negativeEntities = RelativePositionCompression.compressFromSecond(negatives.toArray(negativesArray)); + + //emit all the negative entities array (compressed) for each positive entity + //the first element of the array is the number of positiveEntities + for (int i = 0; i < positivesArray.length; ++i) { + reporter.setStatus((i+1)+"/"+positivesArray.length+" positives"); + output.collect(positivesArray[i], negativeEntities); + } + + //emit all the positive entities array (compressed) for each negative entity + //the first element of the array is the number of negativeEntities + for (int i = 0; i < negativesArray.length; ++i) { + reporter.setStatus((i+1)+"/"+negativesArray.length+" negatives"); + output.collect(negativesArray[i], positiveEntities); + } + } +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedMapperFromCompressedNPClean.java b/MetaBlocking/src/main/java/entityBased/EntityBasedMapperFromCompressedNPClean.java new file 
mode 100644 index 0000000..4ac2d94 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedMapperFromCompressedNPClean.java @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + +import hadoopUtils.RelativePositionCompression; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedMapperFromCompressedNPClean extends MapReduceBase implements Mapper { + + VIntArrayWritable toEmit = new VIntArrayWritable(); + /** + * input: a blocking collection + * @param key block id + * @param value entity ids in this block + * @param output key: entity id (each of the input values) + * value: an array with all other entities (ids) in this block + */ + @SuppressWarnings("unchecked") + public void map(VIntWritable key, VIntArrayWritable value, + OutputCollector output, Reporter reporter) throws IOException { + + VIntWritable[] entities = value.get(); + + //separate positives from negatives + List positives = new ArrayList<>(); + List negatives = new ArrayList<>(); + + for (int i = 0; i < entities.length; ++i) { + if (entities[i].get() < 0) { + negatives.add(entities[i]); + } else { + positives.add(entities[i]); + } + } + + if (positives.isEmpty() || negatives.isEmpty()) { + return; //purged block (no comparisons) + } + + Collections.sort(positives); //sort positives in ascending order + Collections.sort(negatives, Collections.reverseOrder()); //sort negatives in descending order (saves more space in compression) + + //compress the two arrays once + VIntWritable[] positivesArray = new VIntWritable[positives.size()]; + VIntWritable[] negativesArray = new VIntWritable[negatives.size()]; + VIntArrayWritable positiveEntities = RelativePositionCompression.compress(positives.toArray(positivesArray)); + VIntArrayWritable negativeEntities = RelativePositionCompression.compress(negatives.toArray(negativesArray)); + + //emit all the negative entities array (compressed) for each positive entity + for (int i = 0; i < positivesArray.length; ++i) { + reporter.setStatus((i+1)+"/"+positivesArray.length+" positives"); + output.collect(positivesArray[i], negativeEntities); + } + + //emit all the positive entities array (compressed) for each negative entity + for (int i = 0; i < negativesArray.length; ++i) { + reporter.setStatus((i+1)+"/"+negativesArray.length+" negatives"); + output.collect(negativesArray[i], positiveEntities); + } + } +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducer.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducer.java new file mode 100644 index 0000000..8b2c818 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducer.java @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +import 
preprocessing.VIntArrayWritable; + +public class EntityBasedReducer extends MapReduceBase implements Reducer { + + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + Set entities = new HashSet<>(); + + while (values.hasNext()) { +// VIntWritable[] next = values.next().get(); //if not compressed + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); //if compressed + entities.addAll(Arrays.asList(next)); + } + + VIntWritable[] tmpArray = new VIntWritable[entities.size()]; + VIntArrayWritable toEmit = new VIntArrayWritable(entities.toArray(tmpArray)); + + output.collect(_key, toEmit); + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerAverageWeight.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerAverageWeight.java new file mode 100644 index 0000000..7d7dc05 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerAverageWeight.java @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + + +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerAverageWeight extends MapReduceBase implements Reducer { + + VIntWritable neighborToEmit = new VIntWritable(); + + public enum Weight {WEIGHT_COUNTER, NUM_EDGES}; + + private double totalBlocks; //for ECBS + private Map counters; //key: neighborId, value: #common Blocks + private Map blocksPerEntity; //key: entityId, value: #blocks containing this entity + + + private Path[] localFiles; + private String weightingScheme = "CBS"; + + + public void configure (JobConf conf) { + counters = new HashMap<>(); + blocksPerEntity = new HashMap<>(); + + weightingScheme = conf.get("weightingScheme", "CBS"); //default weighting scheme is CBS + + if (!weightingScheme.equals("CBS")) { //nothing more is needed for CBS + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(conf); + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + blocksPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + if (weightingScheme.equals("ECBS")) { //then we also need #totalBlocks + try { + JobClient client = new JobClient(conf); + RunningJob parentJob = 
client.getJob(JobID.forName(conf.get("mapred.job.id"))); + totalBlocks = parentJob.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", + "MAP_INPUT_RECORDS").getCounter(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + int entityId = _key.get(); + + counters = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double count = counters.get(neighbor); + if (count == null) { + count = 0.0; + } + counters.put(neighbor, count+1); + } + } + + double sumWeight = 0; + + for (int neighborId : counters.keySet()) { + switch (weightingScheme) { + case "CBS": + sumWeight += counters.get(neighborId);//[neighborId]; // CBS + break; + case "ECBS": + sumWeight += counters.get(neighborId)*Math.log10(totalBlocks/blocksPerEntity.get(entityId))*Math.log10(totalBlocks/blocksPerEntity.get(neighborId)); // ECBS + break; + case "JS": + sumWeight += counters.get(neighborId)/(blocksPerEntity.get(entityId)+blocksPerEntity.get(neighborId)-counters.get(neighborId)); // JS + break; + default: + sumWeight += 0; + } + } + + reporter.incrCounter(Weight.WEIGHT_COUNTER, new Double(sumWeight*1000).longValue()); + reporter.incrCounter(Weight.NUM_EDGES, counters.keySet().size()); + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerAverageWeightARCSClean.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerAverageWeightARCSClean.java new file mode 100644 index 0000000..7ae61da --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerAverageWeightARCSClean.java @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerAverageWeightARCSClean extends MapReduceBase implements Reducer { + + public enum Weight {WEIGHT_COUNTER, NUM_EDGES}; + + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + long bComparisons; + Map weights = new HashMap<>(); //key: neighborId, value: ARCS weightweights = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompressFromSecond(values.next()); + //in clean-clean ER ||b|| = positives * negatives + bComparisons = (next.length-1) * next[0].get(); //next.length-1, because next[0] is the number of entities in the other collection, not an 
entity id + + for (int i = 1; i < next.length; i++) { + VIntWritable neighborId = next[i]; + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double prevWeight = weights.get(neighbor); + if (prevWeight == null) { + prevWeight = 0.0; + } + weights.put(neighbor, prevWeight + 1.0/bComparisons); + } + } + + for (Double weight : weights.values()) { //iterate over the weights only (ignore labels) + reporter.incrCounter(Weight.WEIGHT_COUNTER, new Double(weight*1000).longValue()); + reporter.incrCounter(Weight.NUM_EDGES, 1); + } + + + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerAverageWeightARCSDirty.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerAverageWeightARCSDirty.java new file mode 100644 index 0000000..8a5ec16 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerAverageWeightARCSDirty.java @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerAverageWeightARCSDirty extends MapReduceBase implements Reducer { + + + public enum Weight {WEIGHT_COUNTER, NUM_EDGES}; + + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + long bComparisons; + Map weights = new HashMap<>(); //key: neighborId, value: ARCS weightweights = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + //in dirty ER ||b|| = (|b| * |b|-1) /2 + bComparisons = ((next.length) * (next.length-1)) / 2; //cannot be zero + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double prevWeight = weights.get(neighbor); + if (prevWeight == null) { + prevWeight = 0.0; + } + weights.put(neighbor, prevWeight + 1.0/bComparisons); + } + } + + for (Double weight : weights.values()) { //iterate over the weights only (ignore labels) + reporter.incrCounter(Weight.WEIGHT_COUNTER, new Double(weight*1000).longValue()); + reporter.incrCounter(Weight.NUM_EDGES, 1); + } + + + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerAverageWeightEJS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerAverageWeightEJS.java new file mode 100644 index 0000000..3849bf1 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerAverageWeightEJS.java @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + + +import org.apache.hadoop.filecache.DistributedCache; +import 
org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerAverageWeightEJS extends MapReduceBase implements Reducer { + + VIntWritable neighborToEmit = new VIntWritable(); + + public enum Weight {WEIGHT_COUNTER, NUM_EDGES}; + + private Map counters; //key: neighborId, value: #common Blocks + private Map blocksPerEntity; //key: entityId, value: #blocks containing this entity + private Map comparisonsPerEntity; //key: entityId, value: #unique comparisons of this entity + + + private Path[] localFiles; + + long comparisons; + + + public void configure (JobConf conf) { + counters = new HashMap<>(); + blocksPerEntity = new HashMap<>(); + comparisonsPerEntity = new HashMap<>(); + + comparisons = conf.getLong("comparisons", 0); //default #comparisons is 0 + + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(conf); + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + blocksPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + + //comparisons per entity + try { + SW = new BufferedReader(new FileReader(localFiles[1].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + comparisonsPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + int entityId = _key.get(); + + counters = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double count = counters.get(neighbor); + if (count == null) { + count = 0.0; + } + counters.put(neighbor, count+1); + } + } + + double currEntityWeight = Math.log10((double)comparisons/comparisonsPerEntity.get(entityId)); //pre-calculate this only once + int blocksOfCurrEntity = blocksPerEntity.get(entityId); //pre-calculate this only once + + double sumWeight = 0; + + //calculate the weights of the neighbors now + for (int neighborId : counters.keySet()) { + sumWeight += + (counters.get(neighborId)/(blocksOfCurrEntity+blocksPerEntity.get(neighborId)-counters.get(neighborId))) * + currEntityWeight * + Math.log10((double)comparisons/comparisonsPerEntity.get(neighborId)); + } + + reporter.incrCounter(Weight.WEIGHT_COUNTER, new 
Double(sumWeight*1000).longValue()); + reporter.incrCounter(Weight.NUM_EDGES, counters.keySet().size()); + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEP.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEP.java new file mode 100644 index 0000000..2c7be51 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEP.java @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerCEP extends MapReduceBase implements Reducer { + + VIntWritable one = new VIntWritable(1); + DoubleWritable weightToEmit = new DoubleWritable(); + + + private double totalBlocks; //for ECBS + private Map counters; //key: neighborId, value: #common Blocks + private Map blocksPerEntity; //key: entityId, value: #blocks containing this entity + + + private Path[] localFiles; + private String weightingScheme = "CBS"; + + public void configure (JobConf conf) { + counters = new HashMap<>(); + blocksPerEntity = new HashMap<>(); + + weightingScheme = conf.get("weightingScheme", "CBS"); //default weighting scheme is CBS + + if (!weightingScheme.equals("CBS")) { //nothing more is needed for CBS + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(conf); + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + blocksPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + if (weightingScheme.equals("ECBS")) { //then we also need #totalBlocks + try { + JobClient client = new JobClient(conf); + RunningJob parentJob = client.getJob(JobID.forName(conf.get("mapred.job.id"))); + totalBlocks = parentJob.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", + "MAP_INPUT_RECORDS").getCounter(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + int entityId = _key.get(); + + counters = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int 
neighbor = neighborId.get(); + + Double count = counters.get(neighbor); + if (count == null) { + count = 0.0; + } + counters.put(neighbor, count+1); + } + } + + //calculate the weights of the neighbors now + for (int neighborId : counters.keySet()) { + double currentWeight; + switch (weightingScheme) { + case "CBS": + currentWeight = counters.get(neighborId);//[neighborId]; // CBS + break; + case "ECBS": + currentWeight = counters.get(neighborId)*Math.log10(totalBlocks/blocksPerEntity.get(entityId))*Math.log10(totalBlocks/blocksPerEntity.get(neighborId)); // ECBS + break; + case "JS": + currentWeight = counters.get(neighborId)/(blocksPerEntity.get(entityId)+blocksPerEntity.get(neighborId)-counters.get(neighborId)); // JS + break; + default: + currentWeight = 0; + } + weightToEmit.set(currentWeight); + output.collect(weightToEmit, one); + } + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPARCSClean.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPARCSClean.java new file mode 100644 index 0000000..78e9af2 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPARCSClean.java @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerCEPARCSClean extends MapReduceBase implements Reducer { + + VIntWritable one = new VIntWritable(1); + DoubleWritable weightToEmit = new DoubleWritable(); + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + long bComparisons; + Map weights = new HashMap<>(); //key: neighborId, value: ARCS weightweights = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + //in clean-clean ER ||b|| = positives * negatives + bComparisons = (next.length-1) * next[0].get(); //next.length-1, because next[0] is the number of entities in the other collection, not an entity id + + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double prevWeight = weights.get(neighbor); + if (prevWeight == null) { + prevWeight = 0.0; + } + weights.put(neighbor, prevWeight + 1.0/bComparisons); + } + } + + for (double weight : weights.values()) { //iterate over the weights only (ignore labels) + weightToEmit.set(weight); + output.collect(weightToEmit, one); + } + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPARCSDirty.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPARCSDirty.java new file mode 100644 index 0000000..9cca734 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPARCSDirty.java @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import java.io.IOException; 
+import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerCEPARCSDirty extends MapReduceBase implements Reducer { + + VIntWritable one = new VIntWritable(1); + DoubleWritable weightToEmit = new DoubleWritable(); + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + long bComparisons; + Map weights = new HashMap<>(); //key: neighborId, value: ARCS weightweights = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + //in dirty ER ||b|| = (|b| * |b|-1) /2 + bComparisons = ((next.length) * (next.length-1)) / 2; //cannot be zero, cannot be odd number (no casting needed) + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double prevWeight = weights.get(neighbor); + if (prevWeight == null) { + prevWeight = 0.0; + } + weights.put(neighbor, prevWeight + 1.0/bComparisons); + } + } + + for (double weight : weights.values()) { //iterate over the weights only (ignore labels) + weightToEmit.set(weight); + output.collect(weightToEmit, one); + } + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPEJS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPEJS.java new file mode 100644 index 0000000..6391e20 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPEJS.java @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerCEPEJS extends MapReduceBase implements Reducer { + + VIntWritable one = new VIntWritable(1); + DoubleWritable weightToEmit = new DoubleWritable(); + + + private Map counters; //key: neighborId, value: #common Blocks + private Map blocksPerEntity; //key: entityId, value: #blocks containing this entity + private Map comparisonsPerEntity; //key: entityId, value: #unique comparisons of this entity + + + private Path[] localFiles; + + long comparisons; + + public void configure (JobConf conf) { + counters = new HashMap<>(); + blocksPerEntity = new HashMap<>(); + comparisonsPerEntity = new HashMap<>(); + + comparisons = conf.getLong("comparisons", 0); //default #comparisons is 0 + + BufferedReader SW; + try { + 
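+			//the first cached file (blocks per entity) is loaded into blocksPerEntity: entityId -> #blocks containing that entity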
localFiles = DistributedCache.getLocalCacheFiles(conf); + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + blocksPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + + //comparisons per entity + try { + SW = new BufferedReader(new FileReader(localFiles[1].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + comparisonsPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + int entityId = _key.get(); + + counters = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double count = counters.get(neighbor); + if (count == null) { + count = 0.0; + } + counters.put(neighbor, count+1); + } + } + + //calculate the weights of the neighbors now + double currEntityWeight = Math.log10((double)comparisons/comparisonsPerEntity.get(entityId)); //pre-calculate this only once + int blocksOfCurrEntity = blocksPerEntity.get(entityId); //pre-calculate this only once + + DecimalFormat df = new DecimalFormat("#.###"); //format doubles to keep only first 4 decimal points (saves space) + + for (int neighborId : counters.keySet()) { + double currentWeight = + (counters.get(neighborId)/(blocksOfCurrEntity+blocksPerEntity.get(neighborId)-counters.get(neighborId))) * + currEntityWeight * + Math.log10((double)comparisons/comparisonsPerEntity.get(neighborId)); + + weightToEmit.set(Double.parseDouble(df.format(currentWeight))); + output.collect(weightToEmit, one); + } + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPFinal.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPFinal.java new file mode 100644 index 0000000..aa78697 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPFinal.java @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; 
+import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerCEPFinal extends MapReduceBase implements Reducer { + + public enum Output {NUM_RECORDS}; + + VIntWritable neighborToEmit = new VIntWritable(); + + private double totalBlocks; //for ECBS + private Map counters; //key: neighborId, value: #common Blocks + private Map blocksPerEntity; //key: entityId, value: #blocks containing this entity + + + private Path[] localFiles; + private String weightingScheme = "CBS"; + + double minWeight; + + public void configure (JobConf conf) { + counters = new HashMap<>(); + blocksPerEntity = new HashMap<>(); + + minWeight = Double.parseDouble(conf.get("min", "0.0")); + + weightingScheme = conf.get("weightingScheme", "CBS"); //default weighting scheme is CBS + + if (!weightingScheme.equals("CBS")) { //nothing more is needed for CBS + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(conf); + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + blocksPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + if (weightingScheme.equals("ECBS")) { //then we also need #totalBlocks + try { + JobClient client = new JobClient(conf); + RunningJob parentJob = client.getJob(JobID.forName(conf.get("mapred.job.id"))); + totalBlocks = parentJob.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", + "MAP_INPUT_RECORDS").getCounter(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + int entityId = _key.get(); + + counters = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double count = counters.get(neighbor); + if (count == null) { + count = 0.0; + } + counters.put(neighbor, count+1); + } + } + + //calculate the weights of the neighbors now + for (int neighborId : counters.keySet()) { + double currentWeight; + switch (weightingScheme) { + case "CBS": + currentWeight = counters.get(neighborId);//[neighborId]; // CBS + break; + case "ECBS": + currentWeight = counters.get(neighborId)*Math.log10(totalBlocks/blocksPerEntity.get(entityId))*Math.log10(totalBlocks/blocksPerEntity.get(neighborId)); // ECBS + break; + case "JS": + currentWeight = counters.get(neighborId)/(blocksPerEntity.get(entityId)+blocksPerEntity.get(neighborId)-counters.get(neighborId)); // JS + break; + default: + currentWeight = 0; + } + + if (currentWeight >= minWeight) { + neighborToEmit.set(neighborId); +// output.collect(key, neighborToEmit); + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save disk space + } + } + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPFinalARCSClean.java 
b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPFinalARCSClean.java new file mode 100644 index 0000000..b65eb1f --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPFinalARCSClean.java @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerCEPFinalARCSClean extends MapReduceBase implements Reducer { + + public enum Output {NUM_RECORDS}; + + VIntWritable neighborToEmit = new VIntWritable(); + + double minWeight; + + public void configure (JobConf conf) { + minWeight = Double.parseDouble(conf.get("min", "0.0")); + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + long bComparisons; + Map weights = new HashMap<>(); //key: neighborId, value: ARCS weightweights = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + //in clean-clean ER ||b|| = positives * negatives + bComparisons = (next.length-1) * next[0].get(); //next.length-1, because next[0] is the number of entities in the other collection, not an entity id + + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double prevWeight = weights.get(neighbor); + if (prevWeight == null) { + prevWeight = 0.0; + } + weights.put(neighbor, prevWeight + 1.0/bComparisons); + } + } + + //prune the neighbors with weight below minWeight + for (int neighborId : weights.keySet()) { + double currentWeight = weights.get(neighborId); + + if (currentWeight >= minWeight) { + neighborToEmit.set(neighborId); +// output.collect(key, neighborToEmit); //comment out to save disk space + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save disk space (instead of command above) + } + } + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPFinalARCSDirty.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPFinalARCSDirty.java new file mode 100644 index 0000000..19092e0 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPFinalARCSDirty.java @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerCEPFinalARCSDirty extends MapReduceBase implements 
Reducer { + + public enum Output {NUM_RECORDS}; + + VIntWritable neighborToEmit = new VIntWritable(); + + double minWeight; + + public void configure (JobConf conf) { + minWeight = Double.parseDouble(conf.get("min", "0.0")); + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + long bComparisons; + Map weights = new HashMap<>(); //key: neighborId, value: ARCS weightweights = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + //in dirty ER ||b|| = (|b| * |b|-1) /2 + bComparisons = ((next.length) * (next.length-1)) / 2; //cannot be zero + + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double prevWeight = weights.get(neighbor); + if (prevWeight == null) { + prevWeight = 0.0; + } + weights.put(neighbor, prevWeight + 1.0/bComparisons); + } + } + + //prune the neighbors with weight below minWeight + for (int neighborId : weights.keySet()) { + double currentWeight = weights.get(neighborId); + + if (currentWeight >= minWeight) { + neighborToEmit.set(neighborId); +// output.collect(key, neighborToEmit); //comment out to save disk space + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save disk space (instead of command above) + } + } + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPFinalEJS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPFinalEJS.java new file mode 100644 index 0000000..a959aba --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCEPFinalEJS.java @@ -0,0 +1,129 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + +import hadoopUtils.RelativePositionCompression; +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerCEPFinalEJS extends MapReduceBase implements Reducer { + + public enum Output {NUM_RECORDS}; + + VIntWritable neighborToEmit = new VIntWritable(); + + private Map counters; //key: neighborId, value: #common Blocks + private Map blocksPerEntity; //key: entityId, value: #blocks containing this entity + private Map comparisonsPerEntity; //key: entityId, value: #unique comparisons of this entity + + + private Path[] localFiles; + + long comparisons; + + double minWeight; + + public void configure (JobConf conf) { + minWeight = Double.parseDouble(conf.get("min", "0.0")); + + counters = new HashMap<>(); + blocksPerEntity = new HashMap<>(); + comparisonsPerEntity = new HashMap<>(); + + comparisons = conf.getLong("comparisons", 0); //default #comparisons is 0 + + BufferedReader SW; + try { + localFiles = 
DistributedCache.getLocalCacheFiles(conf); + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + blocksPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + + //comparisons per entity + try { + SW = new BufferedReader(new FileReader(localFiles[1].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + comparisonsPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + int entityId = _key.get(); + + counters = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double count = counters.get(neighbor); + if (count == null) { + count = 0.0; + } + counters.put(neighbor, count+1); + } + } + + //calculate the weights of the neighbors now + double currEntityWeight = Math.log10((double)comparisons/comparisonsPerEntity.get(entityId)); //pre-calculate this only once + int blocksOfCurrEntity = blocksPerEntity.get(entityId); //pre-calculate this only once + + for (int neighborId : counters.keySet()) { + double currentWeight = + (counters.get(neighborId)/(blocksOfCurrEntity+blocksPerEntity.get(neighborId)-counters.get(neighborId))) * + currEntityWeight * + Math.log10((double)comparisons/comparisonsPerEntity.get(neighborId)); + + if (currentWeight >= minWeight) { + neighborToEmit.set(neighborId); +// output.collect(key, neighborToEmit); + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save disk space + } + } + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCNP.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCNP.java new file mode 100644 index 0000000..927cc5d --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCNP.java @@ -0,0 +1,164 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import hadoopUtils.ValueComparator; +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import 
org.apache.hadoop.mapred.RunningJob; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerCNP extends MapReduceBase implements Reducer { + + VIntWritable neighborToEmit = new VIntWritable(); + + public enum Output {NUM_RECORDS}; + + private double totalBlocks; //for ECBS + private int k; //for topK + private Map counters; //key: neighborId, value: #common Blocks + private Map blocksPerEntity; //key: entityId, value: #blocks containing this entity + + + private Path[] localFiles; + private String weightingScheme = "CBS"; + + public void configure (JobConf conf) { + counters = new HashMap<>(); + blocksPerEntity = new HashMap<>(); + float BCin = conf.getFloat("BCin", 1.0f); + k = ((Double)Math.floor(BCin - 1)).intValue(); + + weightingScheme = conf.get("weightingScheme", "CBS"); //default weighting scheme is CBS + + if (!weightingScheme.equals("CBS")) { //nothing more is needed for CBS + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(conf); + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + blocksPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + if (weightingScheme.equals("ECBS")) { //then we also need #totalBlocks + try { + JobClient client = new JobClient(conf); + RunningJob parentJob = client.getJob(JobID.forName(conf.get("mapred.job.id"))); + totalBlocks = parentJob.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", + "MAP_INPUT_RECORDS").getCounter(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + int entityId = _key.get(); + + counters = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double count = counters.get(neighbor); + if (count == null) { + count = 0.0; + } + counters.put(neighbor, count+1); + } + } + + //new feature: stop here to save the weight computations, if |neighbors| <= k, since we need the top-k neighbors + if (!weightingScheme.equals("CBS")) { + if (counters.keySet().size() <= k) { //just emit every neighbor + for(Integer neighborId : counters.keySet()) { + neighborToEmit.set(neighborId); +// output.collect(_key, neighborToEmit); //skip writing the actual output + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save space and time + } + return; //stop here + } + } + + + //if k < |neighbors|, continue calculating the weights + Map weights = new HashMap<>(); + + //calculate the weights of the neighbors now + for (int neighborId : counters.keySet()) { + double currentWeight; + switch (weightingScheme) { + case "CBS": + currentWeight = counters.get(neighborId);//[neighborId]; // CBS + break; + case "ECBS": + currentWeight = 
counters.get(neighborId)*Math.log10(totalBlocks/blocksPerEntity.get(entityId))*Math.log10(totalBlocks/blocksPerEntity.get(neighborId)); // ECBS + break; + case "JS": + currentWeight = counters.get(neighborId)/(blocksPerEntity.get(entityId)+blocksPerEntity.get(neighborId)-counters.get(neighborId)); // JS + break; + default: + currentWeight = 0; + } + + + weights.put(neighborId, currentWeight); + } + + ValueComparator vc = new ValueComparator(weights); //sorts by value (descending) + TreeMap sortedMap = new TreeMap<>(vc); + sortedMap.putAll(weights); + + for(Integer neighborId : sortedMap.keySet()) { + if (k-- == 0) { //continue until k values have been emitted + return; + } + neighborToEmit.set(neighborId); +// output.collect(_key, neighborToEmit); //skip writing the actual output + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save space and time + } + + + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCNPARCSClean.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCNPARCSClean.java new file mode 100644 index 0000000..af50981 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCNPARCSClean.java @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import hadoopUtils.ValueComparator; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerCNPARCSClean extends MapReduceBase implements Reducer { + + VIntWritable neighborToEmit = new VIntWritable(); + + public enum Output {NUM_RECORDS}; + + private int k; //for topK + + public void configure (JobConf conf) { + float BCin = conf.getFloat("BCin", 1.0f); + k = ((Double)Math.floor(BCin - 1)).intValue(); + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + long bComparisons; + Map weights = new HashMap<>(); //key: neighborId, value: ARCS weightweights = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompressFromSecond(values.next()); + //in clean-clean ER ||b|| = positives * negatives + bComparisons = (next.length-1) * next[0].get(); //next.length-1, because next[0] is the number of entities in the other collection, not an entity id + + for (int i = 1; i < next.length; i++) { + VIntWritable neighborId = next[i]; + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double prevWeight = weights.get(neighbor); + if (prevWeight == null) { + prevWeight = 0.0; + } + weights.put(neighbor, prevWeight + 1.0/bComparisons); + } + } + + ValueComparator vc = new ValueComparator(weights); //sorts by value (descending) + TreeMap sortedMap = new TreeMap<>(vc); + sortedMap.putAll(weights); + + for(Integer neighborId : sortedMap.keySet()) { + if (k-- == 0) { //continue until k values have been 
emitted + return; + } + neighborToEmit.set(neighborId); +// output.collect(_key, neighborToEmit); //skip writing the actual output + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save space and time + } + + + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCNPARCSDirty.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCNPARCSDirty.java new file mode 100644 index 0000000..3412aba --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCNPARCSDirty.java @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import hadoopUtils.ValueComparator; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerCNPARCSDirty extends MapReduceBase implements Reducer { + + VIntWritable neighborToEmit = new VIntWritable(); + + public enum Output {NUM_RECORDS}; + + private int k; //for topK + + public void configure (JobConf conf) { + float BCin = conf.getFloat("BCin", 1.0f); + k = ((Double)Math.floor(BCin - 1)).intValue(); + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + long bComparisons; + Map weights = new HashMap<>(); //key: neighborId, value: ARCS weightweights = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + //in dirty ER ||b|| = (|b| * |b|-1) /2 + bComparisons = ((next.length) * (next.length-1)) / 2; //cannot be zero + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double prevWeight = weights.get(neighbor); + if (prevWeight == null) { + prevWeight = 0.0; + } + weights.put(neighbor, prevWeight + 1.0/bComparisons); + } + } + + ValueComparator vc = new ValueComparator(weights); //sorts by value (descending) + TreeMap sortedMap = new TreeMap<>(vc); + sortedMap.putAll(weights); + + for(Integer neighborId : sortedMap.keySet()) { + if (k-- == 0) { //continue until k values have been emitted + return; + } + neighborToEmit.set(neighborId); +// output.collect(_key, neighborToEmit); //skip writing the actual output + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save space and time + } + + + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCNPEJS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCNPEJS.java new file mode 100644 index 0000000..8a60383 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerCNPEJS.java @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import hadoopUtils.ValueComparator; +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import 
java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; + +import entityBased.EntityBasedReducerCNP.Output; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerCNPEJS extends MapReduceBase implements Reducer { + + VIntWritable neighborToEmit = new VIntWritable(); + + public enum Output {NUM_RECORDS}; + + private int k; //for topK + private Map counters; //key: neighborId, value: #common Blocks + private Map blocksPerEntity; //key: entityId, value: #blocks containing this entity + private Map comparisonsPerEntity; //key: entityId, value: #unique comparisons of this entity + + + private Path[] localFiles; + long comparisons; + + public void configure (JobConf conf) { + counters = new HashMap<>(); + blocksPerEntity = new HashMap<>(); + comparisonsPerEntity = new HashMap<>(); + float BCin = conf.getFloat("BCin", 1.0f); + k = ((Double)Math.floor(BCin - 1)).intValue(); + + comparisons = conf.getLong("comparisons", 0); //default #comparisons is 0 + + + //blocks per entity + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(conf); + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + blocksPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + + //comparisons per entity + try { + SW = new BufferedReader(new FileReader(localFiles[1].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + comparisonsPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + int entityId = _key.get(); + + counters = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double count = counters.get(neighbor); + if (count == null) { + count = 0.0; + } + counters.put(neighbor, count+1); + } + } + + //new feature: stop here to save the weight computations, if |neighbors| <= k, since we need the top-k neighbors + if (counters.keySet().size() <= k) { //just emit every neighbor + for(Integer neighborId : counters.keySet()) { + neighborToEmit.set(neighborId); +// 
output.collect(_key, neighborToEmit); //skip writing the actual output + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save space and time + } + return; //stop here + } + + + //if k < |neighbors|, continue calculating the weights + Map weights = new HashMap<>(); + + double currEntityWeight = Math.log10((double)comparisons/comparisonsPerEntity.get(entityId)); //pre-calculate this only once + int blocksOfCurrEntity = blocksPerEntity.get(entityId); //pre-calculate this only once + + //calculate the weights of the neighbors now + for (int neighborId : counters.keySet()) { + double currentWeight = + (counters.get(neighborId)/(blocksOfCurrEntity+blocksPerEntity.get(neighborId)-counters.get(neighborId))) * + currEntityWeight * + Math.log10((double)comparisons/comparisonsPerEntity.get(neighborId)); + + weights.put(neighborId, currentWeight); + } + + ValueComparator vc = new ValueComparator(weights); //sorts by value (descending) + TreeMap sortedMap = new TreeMap<>(vc); + sortedMap.putAll(weights); + + for(Integer neighborId : sortedMap.keySet()) { + if (k-- == 0) { //continue until k values have been emitted + return; + } + neighborToEmit.set(neighborId); +// output.collect(_key, neighborToEmit); //skip writing the actual output + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save space and time + } + + + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWEP.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWEP.java new file mode 100644 index 0000000..79fdad4 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWEP.java @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + + +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerWEP extends MapReduceBase implements Reducer { + + + public enum Output {NUM_RECORDS}; + + private double totalBlocks; //for ECBS + private Map counters; //key: neighborId, value: #common Blocks + private Map blocksPerEntity; //key: entityId, value: #blocks containing this entity + + + private Path[] localFiles; + private String weightingScheme = "CBS"; + + DoubleWritable weightToEmit = new DoubleWritable(); + + private double averageWeight; + + public void configure (JobConf conf) { + counters = new HashMap<>(); + blocksPerEntity = new HashMap<>(); + + averageWeight = Double.parseDouble(conf.get("averageWeight", "0.0")); + + weightingScheme = conf.get("weightingScheme", "CBS"); //default weighting scheme is CBS + + if (!weightingScheme.equals("CBS")) { //nothing more is needed for CBS + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(conf); + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + String line; + while 
((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + blocksPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + if (weightingScheme.equals("ECBS")) { //then we also need #totalBlocks + try { + JobClient client = new JobClient(conf); + RunningJob parentJob = client.getJob(JobID.forName(conf.get("mapred.job.id"))); + totalBlocks = parentJob.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", + "MAP_INPUT_RECORDS").getCounter(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + int entityId = _key.get(); + + counters = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double count = counters.get(neighbor); + if (count == null) { + count = 0.0; + } + counters.put(neighbor, count+1); + } + } + + double weight; + for (int neighborId : counters.keySet()) { + switch (weightingScheme) { + case "CBS": + weight = counters.get(neighborId);//[neighborId]; // CBS + break; + case "ECBS": + weight = counters.get(neighborId)*Math.log10(totalBlocks/blocksPerEntity.get(entityId))*Math.log10(totalBlocks/blocksPerEntity.get(neighborId)); // ECBS + break; + case "JS": + weight = counters.get(neighborId)/(blocksPerEntity.get(entityId)+blocksPerEntity.get(neighborId)-counters.get(neighborId)); // JS + break; + default: + weight = 0; + } + + if (weight > averageWeight) { + weightToEmit.set(weight); + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save space, instead of writing to HDFS + } + } + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWEPARCSClean.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWEPARCSClean.java new file mode 100644 index 0000000..c44eb42 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWEPARCSClean.java @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerWEPARCSClean extends MapReduceBase implements Reducer { + + public enum Output {NUM_RECORDS}; + + double averageWeight; + + DoubleWritable weightToEmit = new DoubleWritable(); + + public void configure (JobConf conf) { + averageWeight = Double.parseDouble(conf.get("averageWeight", "0.0")); + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with 
the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + long bComparisons; + Map weights = new HashMap<>(); //key: neighborId, value: ARCS weightweights = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompressFromSecond(values.next()); + //in clean-clean ER ||b|| = positives * negatives + bComparisons = (next.length-1) * next[0].get(); //next.length-1, because next[0] is the number of entities in the other collection, not an entity id + + for (int i = 1; i < next.length; i++) { + VIntWritable neighborId = next[i]; + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double prevWeight = weights.get(neighbor); + if (prevWeight == null) { + prevWeight = 0.0; + } + weights.put(neighbor, prevWeight + 1.0/bComparisons); + } + } + + for (Map.Entry edge : weights.entrySet()) { //iterate over the weights only (ignore labels) + weightToEmit.set(edge.getValue()); + reporter.incrCounter(Output.NUM_RECORDS, 1); + } + + + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWEPARCSDirty.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWEPARCSDirty.java new file mode 100644 index 0000000..fca0063 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWEPARCSDirty.java @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerWEPARCSDirty extends MapReduceBase implements Reducer { + + public enum Output {NUM_RECORDS}; + + double averageWeight; + + DoubleWritable weightToEmit = new DoubleWritable(); + + public void configure (JobConf conf) { + averageWeight = Double.parseDouble(conf.get("averageWeight", "0.0")); + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + long bComparisons; + Map weights = new HashMap<>(); //key: neighborId, value: ARCS weightweights = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + //in dirty ER ||b|| = (|b| * |b|-1) /2 + bComparisons = ((next.length) * (next.length-1)) / 2; //cannot be zero + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double prevWeight = weights.get(neighbor); + if (prevWeight == null) { + prevWeight = 0.0; + } + weights.put(neighbor, prevWeight + 1.0/bComparisons); + } + } + + for (Map.Entry edge : weights.entrySet()) { //iterate over the weights only (ignore labels) + 
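/*
 * The while-loop above is the ARCS accumulation used by all the ARCS reducers in this patch:
 * every block b shared by the current entity and a neighbor adds 1/||b|| to their edge weight,
 * where ||b|| = |inner| * |outer| in clean-clean ER and ||b|| = |b|*(|b|-1)/2 in dirty ER.
 * A minimal standalone sketch of the same accumulation, assuming plain Java collections and
 * hypothetical inputs (blockNeighbors and blockCardinalities are not fields of this class):
 *
 *   Map<Integer, Double> arcs = new HashMap<>();
 *   for (int b = 0; b < blockNeighbors.size(); b++) {
 *       long cardinality = blockCardinalities.get(b);        // hypothetical: ||b|| of the b-th block
 *       for (int neighbor : blockNeighbors.get(b)) {         // co-occurring entity ids in that block
 *           arcs.merge(neighbor, 1.0 / cardinality, Double::sum);
 *       }
 *   }
 *
 * The plain WEP/WEPEJS reducers keep only the edges whose weight exceeds the configured
 * averageWeight, while the CNP variants sort this map by descending value (ValueComparator)
 * and emit at most the top-k neighbors per entity.
 */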
weightToEmit.set(edge.getValue()); + reporter.incrCounter(Output.NUM_RECORDS, 1); + } + + + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWEPEJS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWEPEJS.java new file mode 100644 index 0000000..8df9c0d --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWEPEJS.java @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + + +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; + +import entityBased.EntityBasedReducerWEP.Output; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerWEPEJS extends MapReduceBase implements Reducer { + + VIntWritable neighborToEmit = new VIntWritable(); + + public enum Output {NUM_RECORDS}; + + private Map counters; //key: neighborId, value: #common Blocks + private Map blocksPerEntity; //key: entityId, value: #blocks containing this entity + private Map comparisonsPerEntity; //key: entityId, value: #unique comparisons of this entity + + + private Path[] localFiles; + + long comparisons; + + DoubleWritable weightToEmit = new DoubleWritable(); + private double averageWeight; + + public void configure (JobConf conf) { + counters = new HashMap<>(); + blocksPerEntity = new HashMap<>(); + comparisonsPerEntity = new HashMap<>(); + + comparisons = conf.getLong("comparisons", 0); //default #comparisons is 0 + + averageWeight = Double.parseDouble(conf.get("averageWeight", "0.0")); + + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(conf); + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + blocksPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + + //comparisons per entity + try { + SW = new BufferedReader(new FileReader(localFiles[1].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + comparisonsPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + int entityId = 
_key.get(); + + counters = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double count = counters.get(neighbor); + if (count == null) { + count = 0.0; + } + counters.put(neighbor, count+1); + } + } + + double currEntityWeight = Math.log10((double)comparisons/comparisonsPerEntity.get(entityId)); //pre-calculate this only once + int blocksOfCurrEntity = blocksPerEntity.get(entityId); //pre-calculate this only once + + double weight = 0; + + //calculate the weights of the neighbors now + for (int neighborId : counters.keySet()) { + weight = + (counters.get(neighborId)/(blocksOfCurrEntity+blocksPerEntity.get(neighborId)-counters.get(neighborId))) * + currEntityWeight * + Math.log10((double)comparisons/comparisonsPerEntity.get(neighborId)); + if (weight > averageWeight) { + weightToEmit.set(weight); + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save space, instead of writing to HDFS + } + + } + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWNP.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWNP.java new file mode 100644 index 0000000..cb01c52 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWNP.java @@ -0,0 +1,155 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + + +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerWNP extends MapReduceBase implements Reducer { + + VIntWritable neighborToEmit = new VIntWritable(); + + public enum Output {NUM_RECORDS}; + + private double totalBlocks; //for ECBS + private Map counters; //key: neighborId, value: #common Blocks + private Map blocksPerEntity; //key: entityId, value: #blocks containing this entity + + + private Path[] localFiles; + private String weightingScheme = "CBS"; + + + public void configure (JobConf conf) { + counters = new HashMap<>(); + blocksPerEntity = new HashMap<>(); + + weightingScheme = conf.get("weightingScheme", "CBS"); //default weighting scheme is CBS + + if (!weightingScheme.equals("CBS")) { //nothing more is needed for CBS + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(conf); + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + blocksPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + if (weightingScheme.equals("ECBS")) { //then we also 
need #totalBlocks + try { + JobClient client = new JobClient(conf); + RunningJob parentJob = client.getJob(JobID.forName(conf.get("mapred.job.id"))); + totalBlocks = parentJob.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", + "MAP_INPUT_RECORDS").getCounter(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + int entityId = _key.get(); + + counters = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double count = counters.get(neighbor); + if (count == null) { + count = 0.0; + } + counters.put(neighbor, count+1); + } + } + + double averageWeight = 0; + + //first loop to find the average weight + for (int neighborId : counters.keySet()) { + double currentWeight; + switch (weightingScheme) { + case "CBS": + currentWeight = counters.get(neighborId);//[neighborId]; // CBS + break; + case "ECBS": + currentWeight = counters.get(neighborId)*Math.log10(totalBlocks/blocksPerEntity.get(entityId))*Math.log10(totalBlocks/blocksPerEntity.get(neighborId)); // ECBS + break; + case "JS": + currentWeight = counters.get(neighborId)/(blocksPerEntity.get(entityId)+blocksPerEntity.get(neighborId)-counters.get(neighborId)); // JS + break; + default: + currentWeight = 0; + } + averageWeight += currentWeight; + } + averageWeight /= counters.keySet().size(); //no of comparisons + + //second loop to emit weights above average + for (int neighborId : counters.keySet()) { + double currentWeight; + switch (weightingScheme) { + case "CBS": + currentWeight = counters.get(neighborId);//[neighborId]; // CBS + break; + case "ECBS": + currentWeight = counters.get(neighborId)*Math.log10(totalBlocks/blocksPerEntity.get(entityId))*Math.log10(totalBlocks/blocksPerEntity.get(neighborId)); // ECBS + break; + case "JS": + currentWeight = counters.get(neighborId)/(blocksPerEntity.get(entityId)+blocksPerEntity.get(neighborId)-counters.get(neighborId)); // JS + break; + default: + currentWeight = 0; + } + + if (averageWeight < currentWeight) { + neighborToEmit.set(neighborId); +// output.collect(_key, neighborToEmit); //skip writing the actual output + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save space + } + } + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWNPARCSClean.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWNPARCSClean.java new file mode 100644 index 0000000..4b76064 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWNPARCSClean.java @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import preprocessing.VIntArrayWritable; + +public 
class EntityBasedReducerWNPARCSClean extends MapReduceBase implements Reducer { + + VIntWritable neighborToEmit = new VIntWritable(); + + public enum Output {NUM_RECORDS}; + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + long bComparisons; + Map weights = new HashMap<>(); //key: neighborId, value: ARCS weightweights = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompressFromSecond(values.next()); + //in clean-clean ER ||b|| = positives * negatives + bComparisons = (next.length-1) * next[0].get(); //next.length-1, because next[0] is the number of entities in the other collection, not an entity id + + for (int i = 1; i < next.length; i++) { + VIntWritable neighborId = next[i]; + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double prevWeight = weights.get(neighbor); + if (prevWeight == null) { + prevWeight = 0.0; + } + weights.put(neighbor, prevWeight + 1.0/bComparisons); + } + } + + double averageWeight = 0; + + //first loop to find the average weight + for (int neighborId : weights.keySet()) { + averageWeight += weights.get(neighborId); + } + averageWeight /= weights.keySet().size(); //no of comparisons + + //second loop to emit weights above average + for (int neighborId : weights.keySet()) { + if (averageWeight < weights.get(neighborId)) { + neighborToEmit.set(neighborId); +// output.collect(_key, neighborToEmit); //skip writing the actual output + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save space + } + } + + + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWNPARCSDirty.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWNPARCSDirty.java new file mode 100644 index 0000000..b5071b6 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWNPARCSDirty.java @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerWNPARCSDirty extends MapReduceBase implements Reducer { + + VIntWritable neighborToEmit = new VIntWritable(); + + public enum Output {NUM_RECORDS}; + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + long bComparisons; + Map weights = new HashMap<>(); //key: neighborId, value: ARCS weightweights = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + //in dirty ER ||b|| = (|b| * |b|-1) /2 + bComparisons = ((next.length) * (next.length-1)) / 2; //cannot 
be zero + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double prevWeight = weights.get(neighbor); + if (prevWeight == null) { + prevWeight = 0.0; + } + weights.put(neighbor, prevWeight + 1.0/bComparisons); + } + } + + double averageWeight = 0; + + //first loop to find the average weight + for (int neighborId : weights.keySet()) { + averageWeight += weights.get(neighborId); + } + averageWeight /= weights.keySet().size(); //no of comparisons + + //second loop to emit weights above average + for (int neighborId : weights.keySet()) { + if (averageWeight < weights.get(neighborId)) { + neighborToEmit.set(neighborId); +// output.collect(_key, neighborToEmit); //skip writing the actual output + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save space + } + } + + + + + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWNPEJS.java b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWNPEJS.java new file mode 100644 index 0000000..ab8664d --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/EntityBasedReducerWNPEJS.java @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + + +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobID; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; + +import preprocessing.VIntArrayWritable; + +public class EntityBasedReducerWNPEJS extends MapReduceBase implements Reducer { + + VIntWritable neighborToEmit = new VIntWritable(); + + public enum Output {NUM_RECORDS}; + + private Map counters; //key: neighborId, value: #common Blocks + private Map blocksPerEntity; //key: entityId, value: #blocks containing this entity + private Map comparisonsPerEntity; //key: entityId, value: #unique comparisons of this entity + + + private Path[] localFiles; + long comparisons; + + + public void configure (JobConf conf) { + counters = new HashMap<>(); + blocksPerEntity = new HashMap<>(); + comparisonsPerEntity = new HashMap<>(); + comparisons = conf.getLong("comparisons", 0); //default #comparisons is 0 + + //blocks per entity + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(conf); + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + blocksPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + + //comparisons per entity + try { + SW = new BufferedReader(new FileReader(localFiles[1].toString())); + String line; + while ((line = SW.readLine())!= null) { + String[] split = line.split("\t"); + comparisonsPerEntity.put(Integer.parseInt(split[0]), Integer.parseInt(split[1])); + } + 
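/*
 * The two DistributedCache files read in configure() feed the EJS weight that reduce() computes
 * below: for a pair (i, j) sharing `common` blocks,
 *   weight(i, j) = JS(i, j) * log10(totalComparisons / comparisons_i) * log10(totalComparisons / comparisons_j)
 * with JS(i, j) = common / (blocks_i + blocks_j - common). A minimal sketch of that formula,
 * assuming the per-entity counts are already plain numbers (the variable names are hypothetical):
 *
 *   double js  = common / (double) (blocksI + blocksJ - common);
 *   double ejs = js * Math.log10(totalComparisons / (double) comparisonsI)
 *                   * Math.log10(totalComparisons / (double) comparisonsJ);
 *
 * The WNP step then averages these weights per entity and keeps only the neighbors scoring
 * strictly above that entity's average.
 */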
SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + + } + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output the input with the values deduplicated (i.e., each entity appearing only once) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + int entityId = _key.get(); + + counters = new HashMap<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + for (VIntWritable neighborId : next) { + if (neighborId.equals(_key)) { + continue; + } + int neighbor = neighborId.get(); + + Double count = counters.get(neighbor); + if (count == null) { + count = 0.0; + } + counters.put(neighbor, count+1); + } + } + + double averageWeight = 0; + + double currEntityWeight = Math.log10((double)comparisons/comparisonsPerEntity.get(entityId)); //pre-calculate this only once + int blocksOfCurrEntity = blocksPerEntity.get(entityId); //pre-calculate this only once + + //first loop to find the average weight + for (int neighborId : counters.keySet()) { + double currentWeight = + (counters.get(neighborId)/(blocksOfCurrEntity+blocksPerEntity.get(neighborId)-counters.get(neighborId))) * + currEntityWeight * + Math.log10((double)comparisons/comparisonsPerEntity.get(neighborId)); + + averageWeight += currentWeight; + } + averageWeight /= counters.keySet().size(); //no of comparisons + + //second loop to emit weights above average + for (int neighborId : counters.keySet()) { + double currentWeight = + (counters.get(neighborId)/(blocksOfCurrEntity+blocksPerEntity.get(neighborId)-counters.get(neighborId))) * + currEntityWeight * + Math.log10((double)comparisons/comparisonsPerEntity.get(neighborId)); + + if (averageWeight < currentWeight) { + neighborToEmit.set(neighborId); +// output.collect(_key, neighborToEmit); //skip writing the actual output + reporter.incrCounter(Output.NUM_RECORDS, 1); //to save space + } + } + } + +} diff --git a/MetaBlocking/src/main/java/entityBased/NodeDegreeDriver.java b/MetaBlocking/src/main/java/entityBased/NodeDegreeDriver.java new file mode 100644 index 0000000..933813e --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/NodeDegreeDriver.java @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; + +import entityBased.NodeDegreeReducer.Output; + +import preprocessing.VIntArrayWritable; + + +public class NodeDegreeDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(entityBased.NodeDegreeDriver.class); + + conf.setJobName("Node Degrees"); + + conf.setMapOutputKeyClass(VIntWritable.class); + 
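/*
 * This job computes, for each entity, the number of distinct entities it is compared with (its
 * node degree), which presumably becomes the comparisons-per-entity file that the EJS reducers
 * load from the DistributedCache. Since every comparison is seen once from each side, the total
 * number of distinct comparisons is the summed NUM_COMPARISONS counter divided by two, as done
 * further below when comparisons.txt is written. A tiny illustration of that reasoning, using a
 * hypothetical degreePerEntity map:
 *
 *   // clean-clean block {1,2} x {3,4}: degrees 1->2, 2->2, 3->2, 4->2; counter = 8; 8/2 = 4 comparisons
 *   long sumOfDegrees     = degreePerEntity.values().stream().mapToLong(Integer::longValue).sum();
 *   long totalComparisons = sumOfDegrees / 2;   // each pair was counted once per endpoint
 */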
conf.setMapOutputValueClass(VIntArrayWritable.class); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(TextOutputFormat.class); + + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //comparisons per entity + + + conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNPClean.class); //Clean-Clean ER +// conf.setMapperClass(entityBased.EntityBasedMapperFromCompressedNP.class); //Dirty + conf.setReducerClass(entityBased.NodeDegreeReducer.class); + + conf.setNumReduceTasks(448); + + conf.setCompressMapOutput(true); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "10"); + conf.set("mapred.max.tracker.failures", "100"); + conf.set("mapred.job.tracker.handler.count", "40"); + + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + //the following is used only for CNP,CEPTotalOrder but does not create any overhead (keep it always) + if (job == null) { + System.err.println("No job found"); + return; + } + + BufferedWriter bw = null; + try{ + Counters counters = job.getCounters(); + //each comparison is calculated twice (once for each part of the pair) + //no casting needed, since 2 x comparisons cannot be an odd number + Long comparisons = counters.findCounter(Output.NUM_COMPARISONS).getCounter() / 2; + Path pt=new Path("/user/hduser/comparisons.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + bw=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))); + bw.write(comparisons.toString()); + }catch(Exception e){ + System.err.println(e.toString()); + } finally { + try { bw.close(); } + catch (IOException e) {System.err.println(e.toString());} + } + + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/entityBased/NodeDegreeReducer.java b/MetaBlocking/src/main/java/entityBased/NodeDegreeReducer.java new file mode 100644 index 0000000..d84ad83 --- /dev/null +++ b/MetaBlocking/src/main/java/entityBased/NodeDegreeReducer.java @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package entityBased; + + +import hadoopUtils.RelativePositionCompression; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import preprocessing.VIntArrayWritable; + +public class NodeDegreeReducer extends MapReduceBase implements Reducer { + + VIntWritable numNeighbors = new VIntWritable(); + + public enum Output {NUM_COMPARISONS}; + + /** + * @param _key an entity id + * @param values the list of arrays with entity ids appearing in a block with the _key entity + * @param output number of unique comparisons for the input entity id _key + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + Set neighbors = new HashSet<>(); + while (values.hasNext()) { + VIntWritable[] next = RelativePositionCompression.uncompress(values.next()); + 
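/*
 * The reducer below measures the degree by collecting every co-occurring id into a HashSet, so
 * repeated co-occurrences across blocks are counted only once. In dirty ER the entity itself
 * appears in its own blocks, hence the commented-out "-1" variant; in clean-clean ER the mapper
 * emits only ids from the other collection, so the set size is used as is. A small worked
 * example (hypothetical ids), written as a sketch of the same dedup step:
 *
 *   // dirty ER, entity 5 with blocks {5,7,9} and {5,7}: union = {5,7,9}, degree = 3 - 1 = 2
 *   Set<Integer> distinct = new HashSet<>();
 *   for (int[] block : blocksOfEntity) { for (int id : block) distinct.add(id); }
 *   int dirtyDegree = distinct.size() - 1;   // subtract the entity itself
 *   int cleanDegree = distinct.size();       // other-collection ids only, nothing to subtract
 */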
neighbors.addAll(Arrays.asList(next)); //also adds itself in dirty ER + } + + //dirty ER +// numNeighbors.set(neighbors.size()-1); //dirty ER (-1 because neighbors contains the input key) +// reporter.incrCounter(Output.NUM_COMPARISONS, neighbors.size()-1); //dirty ER + + //clean-clean ER + numNeighbors.set(neighbors.size()); //clean-clean ER + reporter.incrCounter(Output.NUM_COMPARISONS, neighbors.size()); //clean-clean ER + + + output.collect(_key, numNeighbors); + } + +} diff --git a/MetaBlocking/src/main/java/hadoopUtils/DescendingDoubleComparator.java b/MetaBlocking/src/main/java/hadoopUtils/DescendingDoubleComparator.java new file mode 100644 index 0000000..71e952b --- /dev/null +++ b/MetaBlocking/src/main/java/hadoopUtils/DescendingDoubleComparator.java @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package hadoopUtils; + +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.io.WritableComparator; + +/** + * Sorts DoubleWritables in descending order + * @author hduser + * + */ +public class DescendingDoubleComparator extends WritableComparator { + + protected DescendingDoubleComparator() { + super(DoubleWritable.class, true); + } + + @SuppressWarnings("rawtypes") + @Override + public int compare(WritableComparable w1, WritableComparable w2) { + DoubleWritable k1 = (DoubleWritable)w1; + DoubleWritable k2 = (DoubleWritable)w2; + return -1 * k1.compareTo(k2); + } + + +} diff --git a/MetaBlocking/src/main/java/hadoopUtils/DescendingVIntComparator.java b/MetaBlocking/src/main/java/hadoopUtils/DescendingVIntComparator.java new file mode 100644 index 0000000..86e079d --- /dev/null +++ b/MetaBlocking/src/main/java/hadoopUtils/DescendingVIntComparator.java @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package hadoopUtils; + +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.io.WritableComparator; + +/** + * Sorts VIntWritables in descending order + * @author hduser + * + */ +public class DescendingVIntComparator extends WritableComparator { + + protected DescendingVIntComparator() { + super(VIntWritable.class, true); + } + + @SuppressWarnings("rawtypes") + @Override + public int compare(WritableComparable w1, WritableComparable w2) { + VIntWritable k1 = (VIntWritable)w1; + VIntWritable k2 = (VIntWritable)w2; + return -1 * k1.compareTo(k2); + } + + +} diff --git a/MetaBlocking/src/main/java/hadoopUtils/InverseReducer.java b/MetaBlocking/src/main/java/hadoopUtils/InverseReducer.java new file mode 100644 index 0000000..f4f15b3 --- /dev/null +++ b/MetaBlocking/src/main/java/hadoopUtils/InverseReducer.java @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package hadoopUtils; + +import java.io.IOException; +import java.util.Iterator; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class InverseReducer extends MapReduceBase implements Reducer { + + public void reduce(DoubleWritable key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + while (values.hasNext()) { + output.collect(values.next(), key); + } + } + +} diff --git a/MetaBlocking/src/main/java/hadoopUtils/MBTools.java b/MetaBlocking/src/main/java/hadoopUtils/MBTools.java new file mode 100644 
index 0000000..8385487 --- /dev/null +++ b/MetaBlocking/src/main/java/hadoopUtils/MBTools.java @@ -0,0 +1,370 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package hadoopUtils; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; + +import org.apache.hadoop.io.VIntWritable; + + +public class MBTools { + + public enum WeightingScheme {CBS, ECBS, ARCS, JS, EJS}; + + /** + * the inverse of Arrays.toString(), returning a list + * @param stringArray the result of Arrays.toString() + * @return the list represenation of the array that was given as parameter in the original Arrays.toString() method + */ + public static List listFromStringArray(String stringArray) { + List result = new ArrayList<>(); + String[] blocks = stringArray.substring(1, stringArray.lastIndexOf("]")).split(", "); + for (String block : blocks) { + result.add(Integer.parseInt(block)); + } + return result; + } + + public static boolean isRepeated(List blocks1, List blocks2, int blockIndex) { + for (int i = 0; i < blocks1.size(); ++i) { + int block1i = blocks1.get(i); + if (i == 0 && block1i == blockIndex) { //first block in index => cannot be repeated + return false; + } + for (int j = 0; j < blocks2.size(); ++j) { + int block2j = blocks2.get(j); + if (j == 0 && block2j == blockIndex) { //first block in index => cannot be repeated + return false; + } + if (block2j < block1i) { + continue; + } + if (block1i < block2j) { + break; + } + if (block1i == block2j) { + return block1i != blockIndex; + } + } + } + //System.err.println("Error!!!!"); //since this check comes from a common block + return false; + } + + public static boolean isRepeated(List blocks1, List blocks2, int blockIndex, String weightingScheme) { + if (weightingScheme.equals("ARCS")) { + List newBlocks1 = new ArrayList<>(blocks1.size()/2); + for (int i = 0; i < blocks1.size(); i+=2) { + newBlocks1.add(blocks1.get(i)); + } + List newBlocks2 = new ArrayList<>(blocks2.size()/2); + for (int i = 0; i < blocks2.size(); i+=2) { + newBlocks2.add(blocks2.get(i)); + } + return isRepeated(newBlocks1, newBlocks2, blockIndex); + } else if (weightingScheme.equals("EJS")) { + //TODO: fill this code + assert(false); //not implemented yet + return isRepeated(blocks1, blocks2, blockIndex); + } else { + return isRepeated(blocks1, blocks2, blockIndex); + } + } + + public static boolean isRepeated(Integer[] blocks1, Integer[] blocks2, int blockIndex) { + for (int i = 0; i < blocks1.length; ++i) { + Integer block1i = blocks1[i]; + if (i == 0 && block1i == blockIndex) { //first block in index => cannot be repeated + return false; + } + for (int j = 0; j < blocks2.length; ++j) { + Integer block2j = blocks2[j]; + if (j == 0 && block2j == blockIndex) { //first block in index => cannot be repeated + return false; + } + if (block2j < block1i) { + continue; + } + if (block1i < block2j) { + break; + } + if (block1i == block2j) { + return block1i != blockIndex; + } + } + } + //System.err.println("Error!!!!"); //since this check comes from a common block + assert(false) : "Error!No common blocks for two entities of the same block!"; + return false; + } + + public static double getWeight(List blocks1, List blocks2, int blockIndex, String weightingScheme) { + return getWeight(blocks1, blocks2, blockIndex, weightingScheme, 0, 0); + } + + public static double getWeight(List blocks1, List blocks2, int blockIndex, 
String weightingScheme, int totalBlocks) { + return getWeight(blocks1, blocks2, blockIndex, weightingScheme, totalBlocks, 0); + } + + public static double getWeight(List blocks1, List blocks2, int blockIndex, String weightingScheme, int totalBlocks, long validComparisons) { + switch (weightingScheme) { + case "ARCS": + final Map commonIndices = getCommonBlockIndicesARCS(blocks1, blocks2); + if (commonIndices == null) { + return -1; + } + double totalWeight = 0; + for (Map.Entry commonBlock : commonIndices.entrySet()) { + totalWeight += 1.0 / commonBlock.getValue(); + } + return totalWeight; + case "CBS": + return getNoOfCommonBlocks(blocks1, blocks2); + case "ECBS": + double commonBlocks = getNoOfCommonBlocks(blocks1, blocks2); + if (commonBlocks < 0) { + return commonBlocks; + } + return commonBlocks * Math.log10((double)totalBlocks / blocks1.size()) * Math.log10((double)totalBlocks / blocks2.size()); + case "JS": + double commonBlocksJS = getNoOfCommonBlocks(blocks1, blocks2); + if (commonBlocksJS < 0) { + return commonBlocksJS; + } + return commonBlocksJS / (blocks1.size() + blocks2.size() - commonBlocksJS); + case "EJS": + List actualBlocksE1 = blocks1.subList(0, blocks1.size()-1); //the last value is the cardinality of e1 + List actualBlocksE2 = blocks2.subList(0, blocks2.size()-1); //the last value is the cardinality of e2 + int e1Comparisons = blocks1.get(blocks1.size()-1); + int e2Comparisons = blocks2.get(blocks2.size()-1); + double commonBlocksEJS = getNoOfCommonBlocks(actualBlocksE1, actualBlocksE2); + if (commonBlocksEJS < 0) { + return commonBlocksEJS; + } + double probability = commonBlocksEJS / (actualBlocksE1.size() + actualBlocksE2.size() - commonBlocksEJS); + return probability * Math.log10((double)validComparisons / e1Comparisons) * Math.log10((double)validComparisons / e2Comparisons); + } + return -1; + } + + + + public static double getWeight(int blockIndex, int[] blocks1, int[] blocks2, String weightingScheme) { + return getWeight(blockIndex,blocks1,blocks2,weightingScheme,0,0); + } + + public static double getWeight(int blockIndex, int[] blocks1, int[] blocks2, String weightingScheme, int totalBlocks) { + return getWeight(blockIndex,blocks1,blocks2,weightingScheme,totalBlocks,0); + } + + public static double getWeight(int blockIndex, int[] blocks1, int[] blocks2, String weightingScheme, int totalBlocks, long validComparisons) { + int commonBlocks = 0; + int noOfBlocks1 = blocks1.length; + int noOfBlocks2 = blocks2.length; + for (int i = 0; i < noOfBlocks1; i++) { + for (int j = 0; j < noOfBlocks2; j++) { + if (blocks2[j] < blocks1[i]) { + continue; + } + + if (blocks1[i] < blocks2[j]) { + break; + } + + if (blocks1[i] == blocks2[j]) { + if (commonBlocks == 0 && blocks1[i] != blockIndex) { + return -1; //comparison has been already performed + } + commonBlocks++; + } + } + } + + switch (weightingScheme) { + case "CBS": + return commonBlocks; + case "ECBS": + return commonBlocks * Math.log10((double)totalBlocks / noOfBlocks1) * Math.log10((double)totalBlocks / noOfBlocks2); + case "JS": + return ((double)commonBlocks) / (noOfBlocks1 + noOfBlocks2 - commonBlocks); + case "EJS": + Integer[] actualBlocksE1 = new Integer[blocks1.length-1]; + for (int i=0; i blocks1Set = new HashSet<>(); + Collections.addAll(blocks1Set,blocks1); + for (int block2 : blocks2) { + if (blocks1Set.contains(block2)) { + counter++; + } + } + return counter; + } + + /** + * finds the common blocks of the two block collections and + * returns an array of block sizes for the common blocks + * @param 
blocks1 the blocks to which e1 belongs, along with their size [block1,|block1|,block2,|block2|,block3,...] + * @param blocks2 the blocks to which e2 belongs, along with their size [block1,|block1|,block2,|block2|,block3,...] + * @return an array of block sizes for the common blocks + */ + private static int[] getCommonBlockIndicesARCS(int[] blocks1, int[] blocks2) { + List indicesOfCommonBlocks = new ArrayList<>(); + + Set blocks1Set = new HashSet<>(); + for (int i = 0; i < blocks1.length; i+=2) { //store blocks1 in a set + blocks1Set.add(blocks1[i]); + } + for (int i = 0; i < blocks2.length; i+=2) { //find the indices (in blocks2) of common blocks + if (blocks1Set.contains(blocks2[i])) { + indicesOfCommonBlocks.add(i); + } + } + int[] result = new int[indicesOfCommonBlocks.size()]; + for (int i = 0; i < result.length; ++i) { + result[i] = blocks2[indicesOfCommonBlocks.get(i)+1];//store the size of each common block in results + } + return result; + } + + + private static Map getCommonBlockIndicesARCS(List blocks1, List blocks2) { + Map newBlocks1 = new HashMap<>(); + for (int i = 0; i < blocks1.size(); i+=2) { + newBlocks1.put(blocks1.get(i), blocks1.get(i+1)); + } + Map newBlocks2 = new HashMap<>(); + for (int i = 0; i < blocks2.size(); i+=2) { + newBlocks2.put(blocks2.get(i), blocks2.get(i+1)); + } + newBlocks1.entrySet().retainAll(newBlocks2.entrySet()); + return newBlocks1; + } + + public static Set getCommonBlockIndices(List blocks1, List blocks2) { + Set intersection = new HashSet<>(blocks1); + intersection.retainAll(blocks2); + return intersection; + + /*List blocks1 = entityIndex.get(e1); + List blocks2 = entityIndex.get(e2); + boolean firstCommonIndex = false; + int noOfBlocks1 = blocks1.size(); + int noOfBlocks2 = blocks2.size(); + final List indices = new ArrayList(); + for (int i = 0; i < noOfBlocks1; i++) { + for (int j = 0; j < noOfBlocks2; j++) { + if (blocks2.get(j) < blocks1.get(i)) { + continue; + } + if (blocks1.get(i) < blocks2.get(j)) { + break; + } + if (blocks1.get(i) == blocks2.get(j)) { + if (!firstCommonIndex) { + firstCommonIndex = true; + if (blocks1.get(i) != blockIndex) { + return null; + } + } + indices.add(blocks1.get(i)); + } + } + } + return indices;*/ + } + + public static int getNoOfCommonBlocks(List blocks1, List blocks2) { + + return getCommonBlockIndices(blocks1, blocks2).size(); + + /* + boolean firstCommonIndex = false; + int commonBlocks = 0; + int noOfBlocks1 = blocks1.size(); + int noOfBlocks2 = blocks2.size(); + for (int i = 0; i < noOfBlocks1; i++) { + int block1i = blocks1.get(i); + for (int j = 0; j < noOfBlocks2; j++) { + int blocks2j = blocks2.get(j); + if (blocks2j < block1i) { + continue; + } + + if (block1i < blocks2j) { + break; + } + + if (block1i == blocks2j) { + commonBlocks++; + if (!firstCommonIndex) { + firstCommonIndex = true; + if (block1i != blockIndex) { + return -1; + } + } + } + } + } + + return commonBlocks; + */ + } + + /*public static int getNoOfEntityBlocks(Map> entityIndex, int entityId) { + List entityBlocks; + if ((entityBlocks = entityIndex.get(entityId)) == null) { + return -1; + } + + return entityBlocks.size(); + }*/ + + public static void main (String[] args) { + int [] blocks1 = new int[]{1,2, 2,3, 5,3, 6,5}; + int [] blocks2 = new int[]{1,2, 5,3, 4,5}; + System.out.println(Arrays.toString(getCommonBlockIndicesARCS(blocks1, blocks2))); + } + +} diff --git a/MetaBlocking/src/main/java/hadoopUtils/MapSortByValue.java b/MetaBlocking/src/main/java/hadoopUtils/MapSortByValue.java new file mode 100644 index 
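Note (not part of the patch): as a quick sanity check of the list-based MBTools.getWeight overload above, the snippet below computes a Jaccard (JS) weight for two entities that share two blocks. The block ids are made up and assumed to be sorted Integer lists.

import java.util.Arrays;
import java.util.List;

public class WeightExample {
    public static void main(String[] args) {
        List<Integer> blocksOfE1 = Arrays.asList(1, 3, 5);
        List<Integer> blocksOfE2 = Arrays.asList(3, 5, 8);
        // 2 common blocks / (3 + 3 - 2) distinct blocks = 0.5
        double js = hadoopUtils.MBTools.getWeight(blocksOfE1, blocksOfE2, 3, "JS");
        System.out.println(js); // prints 0.5
    }
}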
0000000..a767e0e --- /dev/null +++ b/MetaBlocking/src/main/java/hadoopUtils/MapSortByValue.java @@ -0,0 +1,38 @@ +package hadoopUtils; + +import java.util.Collections; +import java.util.Comparator; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +/** + * + * @see stackoverflow post + */ +public class MapSortByValue { + + public static > Map + sortByValue( Map map ) +{ + List> list = + new LinkedList<>( map.entrySet() ); + Collections.sort( list, new Comparator>() + { + @Override + public int compare( Map.Entry o1, Map.Entry o2 ) + { + return -(o1.getValue()).compareTo( o2.getValue() ); + } + } ); + + Map result = new LinkedHashMap<>(); + for (Map.Entry entry : list) + { + result.put( entry.getKey(), entry.getValue() ); + } + return result; +} + +} diff --git a/MetaBlocking/src/main/java/hadoopUtils/Partition.java b/MetaBlocking/src/main/java/hadoopUtils/Partition.java new file mode 100644 index 0000000..999c059 --- /dev/null +++ b/MetaBlocking/src/main/java/hadoopUtils/Partition.java @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package hadoopUtils; + +import java.util.HashMap; +import java.util.Map; + +public class Partition { + + private Map blocks; + private long totalComparisons; + + public Partition() { + blocks = new HashMap<>(); + totalComparisons = 0; + } + + /** + * + * @param block a Map entry, whose key is a block Id and value is the num of comparisons in this block + */ + public void addBlock(Map.Entry block) { + blocks.put(block.getKey(),block.getValue()); + totalComparisons += block.getValue(); + } + + public long getTotalComparisons() { + return totalComparisons; + } + + public Map getBlocks() { + return blocks; + } + + + +} diff --git a/MetaBlocking/src/main/java/hadoopUtils/PartitionComparator.java b/MetaBlocking/src/main/java/hadoopUtils/PartitionComparator.java new file mode 100644 index 0000000..b66005b --- /dev/null +++ b/MetaBlocking/src/main/java/hadoopUtils/PartitionComparator.java @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package hadoopUtils; + +import java.util.Comparator; + +public class PartitionComparator implements Comparator { + + @Override + public int compare(Partition partition1, Partition partition2) { + return new Long(partition1.getTotalComparisons()).compareTo(partition2.getTotalComparisons()); + } + + + +} diff --git a/MetaBlocking/src/main/java/hadoopUtils/ReadHadoopStats.java b/MetaBlocking/src/main/java/hadoopUtils/ReadHadoopStats.java new file mode 100644 index 0000000..67311c5 --- /dev/null +++ b/MetaBlocking/src/main/java/hadoopUtils/ReadHadoopStats.java @@ -0,0 +1,59 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */ +package hadoopUtils; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLConnection; +import java.net.URLDecoder; + +/** + * + * @author VASILIS + */ +public class ReadHadoopStats { + + /** + * @param args the command line arguments + */ + public static void main(String[] args) throws MalformedURLException, UnsupportedEncodingException, IOException { + final int PARTITIONS = 728; //<--------------CHANGE THIS VALUE EACH TIME + + final String TASKID = "201607021850"; + for (int i = 0; i < PARTITIONS; i++) { + String reduceId = String.format("%06d", i); + System.out.print(i+":"); //i is the partition number + + String urlStr = "http://83.212.123.10:50030/taskstats.jsp?tipid=task_"+TASKID+"_0001_r_"+reduceId; + + URL url = new URL(URLDecoder.decode(urlStr, "UTF-8")); + URLConnection con = url.openConnection(); + + try (BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8"))) { + String input; + while ((input = in.readLine()) != null) { + if (input.contains("CLEAN_BLOCKS")) { + String nextLine = in.readLine(); + System.out.print(nextLine.substring(nextLine.indexOf(">")+1, nextLine.lastIndexOf("<"))+":"); + } + if (input.contains("COMPARISONS")) { + String nextLine = in.readLine(); + System.out.println(nextLine.substring(nextLine.indexOf(">")+1, nextLine.lastIndexOf("<"))); + continue; + } + } + } + } + + + + } + +} diff --git a/MetaBlocking/src/main/java/hadoopUtils/RelativePositionCompression.java b/MetaBlocking/src/main/java/hadoopUtils/RelativePositionCompression.java new file mode 100644 index 0000000..618a042 --- /dev/null +++ b/MetaBlocking/src/main/java/hadoopUtils/RelativePositionCompression.java @@ -0,0 +1,257 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package hadoopUtils; + +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.hadoop.io.VIntWritable; + +import preprocessing.VIntArrayWritable; + +public class RelativePositionCompression { + + /** + * Returns a compression of the input set with length equal to input collection's size, + * containing all the elements of uncompressed as their difference to their previous element.
+ * Example: [1, 3, 14, 17, 25, 40] --> [1, 2, 11, 3, 8, 15] + * @param uncompressed the set to be compressed + * @return a compressed array representation of the uncompressed collection + */ + public static VIntArrayWritable compress(Set uncompressed) { + Set blocks = new TreeSet<>(uncompressed); //in case they are unordered + + + VIntWritable[] compressed = new VIntWritable[blocks.size()]; + int i = 0; + int prevBlock = 0; + int currBlock; + Iterator it = blocks.iterator(); + while (it.hasNext()) { + currBlock = it.next().get(); + compressed[i++] = new VIntWritable(currBlock - prevBlock); + prevBlock = currBlock; + } + + return new VIntArrayWritable(compressed); + } + + + + /** + * Returns a compression of the input set with length equal to input collection's size, + * containing all the elements of uncompressed as their difference to their previous element.
+ * Example: [1, 3, 14, 17, 25, 40] --> [1, 2, 11, 3, 8, 15] + * @param uncompressed the array to be compressed + * @return a compressed array representation of the uncompressed array + */ + public static VIntArrayWritable compress(VIntArrayWritable uncompressed) { + VIntWritable[] input = uncompressed.get(); + return compress(input); + } + + /** + * Returns a compression of the input set with length equal to input collection's size, + * containing all the elements of uncompressed as their difference to their previous element.
+ * IMPORTANT: to save time, it requires that the input uncompressed array is sorted + * Example: [1, 3, 14, 17, 25, 40] --> [1, 2, 11, 3, 8, 15] + * @param uncompressed the (sorted!) array to be compressed + * @return a compressed array representation of the uncompressed array + */ + public static VIntArrayWritable compress(VIntWritable[] uncompressed) { + if (uncompressed.length < 2) { //no change applicable => return the input + return new VIntArrayWritable(uncompressed); + } + //Arrays.sort(uncompressed); //assume input uncompressed array is sorted + + VIntWritable[] compressed = new VIntWritable[uncompressed.length]; + + int currElement; + int prevElement = uncompressed[0].get(); + compressed[0] = uncompressed[0]; + for (int i = 1; i< compressed.length; ++i) { + currElement = uncompressed[i].get(); + compressed[i] = new VIntWritable(currElement - prevElement); + prevElement = currElement; + } + + return new VIntArrayWritable(compressed); + } + + + public static VIntArrayWritable compressFromSecond(VIntWritable[] uncompressed) { + if (uncompressed.length < 3) { //no change applicable => return the input + return new VIntArrayWritable(uncompressed); + } + //Arrays.sort(uncompressed); //assume input uncompressed array is sorted + + VIntWritable[] compressed = new VIntWritable[uncompressed.length]; + compressed[0] = uncompressed[0]; //ignore the first element + + int currElement; + int prevElement = uncompressed[1].get(); + compressed[1] = uncompressed[1]; + for (int i = 2; i< compressed.length; ++i) { + currElement = uncompressed[i].get(); + compressed[i] = new VIntWritable(currElement - prevElement); + prevElement = currElement; + } + + return new VIntArrayWritable(compressed); + } + + + + + + /** + * Uncompresses the input set with length equal to input array's length, + * containing all the elements of compressed by adding the current compressed element to its previous uncompressed element.
+ * Example: [1, 2, 11, 3, 8, 15] --> [1, 3, 14, 17, 25, 40] + * @param compressed the array to be uncompressed + * @return an uncompressed representation of compressed array + */ + public static VIntWritable[] uncompress(VIntArrayWritable compressed) { + VIntWritable[] compressedArray = compressed.get(); + if (compressedArray.length < 2) { + return compressed.get(); + } + VIntWritable[] uncompressed = new VIntWritable[compressedArray.length]; + + int prevElement = compressedArray[0].get(); + int currElement; + uncompressed[0] = compressedArray[0]; + for (int i = 1; i < compressedArray.length; ++i) { + currElement = prevElement + compressedArray[i].get(); + prevElement = currElement; + uncompressed[i] = new VIntWritable(currElement); + } + + return uncompressed; + } + + + public static VIntWritable[] uncompressFromSecond(VIntArrayWritable compressed) { + VIntWritable[] compressedArray = compressed.get(); + if (compressedArray.length < 3) { + return compressed.get(); + } + VIntWritable[] uncompressed = new VIntWritable[compressedArray.length]; + + //ignore the first element + uncompressed[0] = compressedArray[0]; + + int prevElement = compressedArray[1].get(); + int currElement; + uncompressed[1] = compressedArray[1]; + for (int i = 2; i < compressedArray.length; ++i) { + currElement = prevElement + compressedArray[i].get(); + prevElement = currElement; + uncompressed[i] = new VIntWritable(currElement); + } + + return uncompressed; + } + + + + + /** + * Uncompresses the input set with length equal to input array's length, + * containing all the elements of compressed by adding the current compressed element to its previous uncompressed element.
+ * Example: [1, 2, 11, 3, 8, 15] --> [1, 3, 14, 17, 25, 40] + * @param compressed the array to be uncompressed + * @return an uncompressed representation of compressed array + */ + public static Integer[] uncompress(Integer[] compressed) { + if (compressed.length < 1) { + return null; + } + Integer[] uncompressed = new Integer[compressed.length]; + + Integer prevBlock = compressed[0]; + Integer currBlock; + uncompressed[0] = prevBlock; + for (int i = 1; i < compressed.length; ++i) { + currBlock = prevBlock + compressed[i]; + prevBlock = currBlock; + uncompressed[i] = currBlock; + } + + return uncompressed; + } + + + /** + * Uncompresses the input set with length equal to input array's length, + * containing all the elements of compressed by adding the current compressed element to its previous uncompressed element.
+ * Example: [1, 2, 11, 3, 8, 15] --> [1, 3, 14, 17, 25, 40] + * @param compressed the array to be uncompressed + * @return an uncompressed representation of compressed array + */ + public static VIntArrayWritable uncompress(Collection compressed) { + Integer[] compressedArray = new Integer[compressed.size()]; + compressedArray = compressed.toArray(compressedArray); + if (compressedArray.length < 1) { + return null; + } + VIntWritable[] uncompressed = new VIntWritable[compressedArray.length]; + + int prevBlock = compressedArray[0]; + int currBlock; + uncompressed[0] = new VIntWritable(prevBlock); + for (int i = 1; i < compressedArray.length; ++i) { + currBlock = prevBlock + compressedArray[i]; + prevBlock = currBlock; + uncompressed[i] = new VIntWritable(currBlock); + } + + return new VIntArrayWritable(uncompressed); + } + + public static void main (String[] args) { + Set blocks = new TreeSet<>(); + blocks.add(new VIntWritable(3)); + blocks.add(new VIntWritable(1)); + blocks.add(new VIntWritable(17)); + blocks.add(new VIntWritable(25)); + blocks.add(new VIntWritable(40)); + blocks.add(new VIntWritable(14)); + System.out.println(Arrays.toString(blocks.toArray())); + + VIntWritable[] compressed = compress(blocks).get(); + System.out.println(Arrays.toString(compressed)); + System.out.println(Arrays.toString(uncompress(new VIntArrayWritable(compressed)))); + + List test = new ArrayList<>(compressed.length); + for (int i = 0; i < compressed.length; ++i) { + test.add(compressed[i].get()); + } + System.out.println(Arrays.toString(uncompress(test).get())); + + System.out.println("Testing negative arrays:"); + VIntWritable[] blocksArray = new VIntWritable[6]; + blocksArray[0] = new VIntWritable(-3); + blocksArray[1] = new VIntWritable(-1); + blocksArray[2] = new VIntWritable(-17); + blocksArray[3] = new VIntWritable(-25); + blocksArray[4] = new VIntWritable(-40); + blocksArray[5] = new VIntWritable(-14); + Arrays.sort(blocksArray, Collections.reverseOrder()); + System.out.println(Arrays.toString(blocksArray)); + + VIntArrayWritable compressed2 = compress(blocksArray); + System.out.println(Arrays.toString(compressed2.get())); + System.out.println(Arrays.toString(uncompress(compressed2))); + + } + +} diff --git a/MetaBlocking/src/main/java/hadoopUtils/ValueComparator.java b/MetaBlocking/src/main/java/hadoopUtils/ValueComparator.java new file mode 100644 index 0000000..0e19de6 --- /dev/null +++ b/MetaBlocking/src/main/java/hadoopUtils/ValueComparator.java @@ -0,0 +1,21 @@ +package hadoopUtils; + +import java.util.Comparator; +import java.util.Map; + +public class ValueComparator implements Comparator { + + Map base; + public ValueComparator(Map base) { + this.base = base; + } + + // Note: this comparator imposes orderings that are inconsistent with equals. 
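Note (not part of the patch): to make the delta encoding documented above concrete, here is a round-trip sketch over the example array from the javadoc. It assumes VIntArrayWritable.get() returns the wrapped VIntWritable[].

import java.util.Arrays;
import org.apache.hadoop.io.VIntWritable;
import preprocessing.VIntArrayWritable;

public class CompressionRoundTrip {
    public static void main(String[] args) {
        // input must already be sorted in ascending order (see the note in compress(VIntWritable[]))
        VIntWritable[] sorted = { new VIntWritable(1), new VIntWritable(3), new VIntWritable(14),
                                  new VIntWritable(17), new VIntWritable(25), new VIntWritable(40) };
        VIntArrayWritable deltas = hadoopUtils.RelativePositionCompression.compress(sorted);
        System.out.println(Arrays.toString(deltas.get()));     // [1, 2, 11, 3, 8, 15]
        VIntWritable[] restored = hadoopUtils.RelativePositionCompression.uncompress(deltas);
        System.out.println(Arrays.toString(restored));         // [1, 3, 14, 17, 25, 40]
    }
}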
+ public int compare(Integer a, Integer b) { + if (base.get(a) >= base.get(b)) { + return -1; + } else { + return 1; + } // returning 0 would merge keys + } +} diff --git a/MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeByteCounterMapper.java b/MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeByteCounterMapper.java new file mode 100644 index 0000000..4104cb9 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeByteCounterMapper.java @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.IOException; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + + +public class AfterFilteringBlockSizeByteCounterMapper extends MapReduceBase implements Mapper { + + static enum InputData {NOT_AN_ENTITY, NULL_PREFIX_ID, MALFORMED_PAIRS}; + + Text outputValue = new Text(); + VIntWritable one = new VIntWritable(1); + /** + * maps an input entity index into (blockId, 1) pair(s) + * the value is the entity id (input key) along with the num of blocks that contain it + * the key each time is a block id (each element of the input value array) + * @param key an entity id + * @param value an array of block ids that this entity belongs to + * @param output key: a block id (each element of the input value array) - value: the entity id (input key) + */ + public void map(VIntWritable key, VIntArrayWritable value, + OutputCollector output, Reporter reporter) throws IOException { + + VIntWritable [] Bi = value.get(); + for (VIntWritable bi : Bi) { + output.collect(bi, key); + } + + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeByteCounterReducer.java b/MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeByteCounterReducer.java new file mode 100644 index 0000000..7d49d81 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeByteCounterReducer.java @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import hadoopUtils.RelativePositionCompression; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.io.VLongWritable; +import org.apache.hadoop.io.WritableUtils; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class AfterFilteringBlockSizeByteCounterReducer extends MapReduceBase implements Reducer { + + public enum OUTPUT_COUNTER {COMPARISONS}; + + private VLongWritable zero = new VLongWritable(0); + private VLongWritable finalSum = new VLongWritable(); + + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + //separate positives from negatives (for clean-clean) + List positives = new ArrayList<>(); + List negatives = new ArrayList<>(); + + while (values.hasNext()) { + VIntWritable next = values.next(); + if (next.get() < 0) { + negatives.add(next); + } else { + positives.add(next); + } + } + + if (positives.isEmpty() 
|| negatives.isEmpty()) { + output.collect(_key, zero); + return; //purged block (no comparisons) -> emit 0 comparisons + } + + Collections.sort(positives); //sort positives in ascending order + Collections.sort(negatives, Collections.reverseOrder()); //sort negatives in descending order (saves more space in compression) + + //compress the two arrays once + VIntWritable[] positivesArray = new VIntWritable[positives.size()]; + VIntWritable[] negativesArray = new VIntWritable[negatives.size()]; + VIntArrayWritable positiveEntities = RelativePositionCompression.compress(positives.toArray(positivesArray)); + VIntArrayWritable negativeEntities = RelativePositionCompression.compress(negatives.toArray(negativesArray)); + + long positivesSizeInBytes = 0; + for (VIntWritable tmp : positiveEntities.get()) { + positivesSizeInBytes += WritableUtils.getVIntSize(tmp.get()); + } + positivesSizeInBytes *= negatives.size(); + + + long negativesSizeInBytes = 0; + for (VIntWritable tmp : negativeEntities.get()) { + negativesSizeInBytes += WritableUtils.getVIntSize(tmp.get()); + } + negativesSizeInBytes *= positives.size(); + + reporter.incrCounter(OUTPUT_COUNTER.COMPARISONS, (long) positives.size() * negatives.size()); + + finalSum.set(positivesSizeInBytes+negativesSizeInBytes); + output.collect(_key, finalSum); + + + } + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeByteCounterReducerDirty.java b/MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeByteCounterReducerDirty.java new file mode 100644 index 0000000..d4d7656 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeByteCounterReducerDirty.java @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import hadoopUtils.RelativePositionCompression; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.io.VLongWritable; +import org.apache.hadoop.io.WritableUtils; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class AfterFilteringBlockSizeByteCounterReducerDirty extends MapReduceBase implements Reducer { + + public enum OUTPUT_COUNTER {COMPARISONS}; + + private VLongWritable zero = new VLongWritable(0); + private VLongWritable finalSum = new VLongWritable(); + + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + List entities = new ArrayList<>(); + + while (values.hasNext()) { + VIntWritable next = values.next(); + entities.add(next); + } + + if (entities.isEmpty()) { + output.collect(_key, zero); + return; //purged block (no comparisons) -> emit 0 comparisons + } + + Collections.sort(entities); //sort entities in ascending order + + //compress the two arrays once + VIntWritable[] entitiesArray = new VIntWritable[entities.size()]; + VIntArrayWritable allEntities = RelativePositionCompression.compress(entities.toArray(entitiesArray)); + + long sizeInBytes = 0; + for (VIntWritable tmp : allEntities.get()) { + sizeInBytes += WritableUtils.getVIntSize(tmp.get()); + } + reporter.incrCounter(OUTPUT_COUNTER.COMPARISONS, (long)(entities.size() * (long)entities.size()-1)/2); + + finalSum.set(sizeInBytes*entities.size()); + 
output.collect(_key, finalSum); + + + } + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeCounterMapper.java b/MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeCounterMapper.java new file mode 100644 index 0000000..d25ad22 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeCounterMapper.java @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.IOException; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + + +public class AfterFilteringBlockSizeCounterMapper extends MapReduceBase implements Mapper { + + static enum InputData {NOT_AN_ENTITY, NULL_PREFIX_ID, MALFORMED_PAIRS}; + + Text outputValue = new Text(); + VIntWritable one = new VIntWritable(1); + /** + * maps an input entity index into (blockId, 1) pair(s) + * the value is the entity id (input key) along with the num of blocks that contain it + * the key each time is a block id (each element of the input value array) + * @param key an entity id + * @param value an array of block ids that this entity belongs to + * @param output key: a block id (each element of the input value array) - value: 1, for each entity in this block + */ + public void map(VIntWritable key, VIntArrayWritable value, + OutputCollector output, Reporter reporter) throws IOException { + + VIntWritable [] Bi = value.get(); + for (VIntWritable bi : Bi) { + output.collect(bi, one); + } + + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeCounterReducer.java b/MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeCounterReducer.java new file mode 100644 index 0000000..c16f78c --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/AfterFilteringBlockSizeCounterReducer.java @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.IOException; +import java.util.Iterator; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class AfterFilteringBlockSizeCounterReducer extends MapReduceBase implements Reducer { + + + private VIntWritable finalSum = new VIntWritable(); + + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + int sum = 0; + while (values.hasNext()) { + sum += values.next().get(); + } + + finalSum.set(sum); + output.collect(_key, finalSum); //also outputs finalSum 1 + + + } + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/AfterFilteringByteCounter.java b/MetaBlocking/src/main/java/preprocessing/AfterFilteringByteCounter.java new file mode 100644 index 0000000..7fc4ea5 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/AfterFilteringByteCounter.java @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.io.VLongWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; 
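Note (not part of the patch): the byte estimates produced by the two reducers above boil down to WritableUtils.getVIntSize(), summed over each compressed entity list and multiplied by the number of entities it will be paired with. A tiny illustration follows; the printed sizes follow Hadoop's VInt encoding.

import org.apache.hadoop.io.WritableUtils;

public class VIntSizeExample {
    public static void main(String[] args) {
        // small deltas (the point of the relative-position compression) need a single byte
        System.out.println(WritableUtils.getVIntSize(5));         // 1
        // larger values need one length byte plus the value bytes
        System.out.println(WritableUtils.getVIntSize(1000000));   // 4
    }
}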
+import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; + +public class AfterFilteringByteCounter { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.AfterFilteringByteCounter.class); + + conf.setJobName("Block Size Counter from Entity Index"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VLongWritable.class); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(TextOutputFormat.class); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //entity index + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //block sizes (in bytes) + + conf.setMapperClass(preprocessing.AfterFilteringBlockSizeByteCounterMapper.class); + conf.setReducerClass(preprocessing.AfterFilteringBlockSizeByteCounterReducer.class); + conf.setReducerClass(preprocessing.AfterFilteringBlockSizeByteCounterReducerDirty.class); + + + conf.setNumReduceTasks(56); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/AfterFilteringCounter.java b/MetaBlocking/src/main/java/preprocessing/AfterFilteringCounter.java new file mode 100644 index 0000000..062f705 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/AfterFilteringCounter.java @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; + +public class AfterFilteringCounter { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.AfterFilteringCounter.class); + + conf.setJobName("Block Size Counter from Entity Index"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(TextOutputFormat.class); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //entity index + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //block sizes + + conf.setMapperClass(preprocessing.AfterFilteringBlockSizeCounterMapper.class); + conf.setCombinerClass(preprocessing.AfterFilteringBlockSizeCounterReducer.class); + conf.setReducerClass(preprocessing.AfterFilteringBlockSizeCounterReducer.class); + + conf.setNumReduceTasks(56); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/BasicEntityPruningCombiner.java b/MetaBlocking/src/main/java/preprocessing/BasicEntityPruningCombiner.java new file mode 100644 index 0000000..7280b04 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/BasicEntityPruningCombiner.java @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; +import 
java.util.Set; +import java.util.TreeSet; + +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class BasicEntityPruningCombiner extends MapReduceBase implements Reducer { + + VIntWritable fakeId = new VIntWritable(-1); + VIntArrayWritable toEmit = new VIntArrayWritable(); + + Set nonSingulars; + public void configure (JobConf job) { + nonSingulars = new HashSet<>(); + } + + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + Set entities = new TreeSet<>(); + + boolean first = true; + boolean nonSingular = nonSingulars.contains(_key); //initially set to false + + while (values.hasNext()) { + VIntArrayWritable nextValue = values.next(); + if (first && !values.hasNext() && !nonSingular) { //only one value => no repeated comparisons yet + output.collect(_key, nextValue); //just output the input + return; + } + first = false; + VIntWritable[] next = nextValue.get(); + + for (VIntWritable entity : next) { + reporter.progress(); + if (entity.equals(fakeId)) { continue; } + if (entities.add(entity) == false) { //entity is nonSingular + nonSingular = true; + nonSingulars.add(entity); + } + } + } + + if (nonSingular) { + entities.removeAll(nonSingulars); //entities are now only singulars (i.e. without known nonSingulars) + reporter.progress(); + entities.add(fakeId); //to denote that this entity is nonSingular + nonSingulars.add(_key); + } + + VIntWritable[] entitiesArray = new VIntWritable[entities.size()]; + entitiesArray = entities.toArray(entitiesArray); + toEmit.set(entitiesArray); + output.collect(_key, toEmit); + + } +} diff --git a/MetaBlocking/src/main/java/preprocessing/BasicEntityPruningDriver.java b/MetaBlocking/src/main/java/preprocessing/BasicEntityPruningDriver.java new file mode 100644 index 0000000..6586adc --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/BasicEntityPruningDriver.java @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.io.compress.BZip2Codec; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; + + +public class BasicEntityPruningDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.BasicEntityPruningDriver.class); + + conf.setJobName("Basic Entity Pruning (1rst job)"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(NullWritable.class); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(TextOutputFormat.class); + + conf.setCompressMapOutput(true); + conf.setMapOutputCompressorClass(BZip2Codec.class); //slowest, highest compression ratio + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //block construction 
without singulars + + conf.setMapperClass(preprocessing.BasicEntityPruningMapper.class); + //conf.setCombinerClass(preprocessing.BasicEntityPruningCombiner.class); + conf.setReducerClass(preprocessing.EntityPruningReducer.class); + //conf.setReducerClass(preprocessing.BasicEntityPruningReducerNew.class); + + + conf.set("mapred.reduce.slowstart.completed.maps", "1.0"); + conf.setInt("io.sort.mb", 900); //default 100 + conf.setFloat("io.sort.spill.percent", 0.9f); //default 0.8 +// conf.setFloat("io.sort.record.percent", 0.01f); //default 0.05 + conf.setInt("io.sort.factor", 500); //default 10 + conf.setInt("mapred.task.timeout", 1800000); + conf.setNumReduceTasks(224); + //conf.setNumReduceTasks(0); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/BasicEntityPruningMapper.java b/MetaBlocking/src/main/java/preprocessing/BasicEntityPruningMapper.java new file mode 100644 index 0000000..7287d77 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/BasicEntityPruningMapper.java @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.IOException; +import java.util.Arrays; +import java.util.Set; +import java.util.StringTokenizer; +import java.util.TreeSet; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class BasicEntityPruningMapper extends MapReduceBase implements Mapper { + + VIntArrayWritable toEmitFinal = new VIntArrayWritable(); + static enum InputData {NON_SINGLETON_INPUT, NON_SINGLETON_FOUND}; + + /** + * input: a blocking collection + * input key: block id + * input value: entity ids in this block, separated by "," + * output key: entity id (each of the input values) + * output value: entity ids separated by " " (neighbors of output key) + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + + reporter.setStatus("splitting the block "+key); + Set entities = new TreeSet<>(); //dirty ER, sorts entity ids in ascending order + + StringTokenizer tok = new StringTokenizer(value.toString(),"#"); + + for (Integer entity = Integer.parseInt(tok.nextToken()); tok.hasMoreElements(); entity=Integer.parseInt(tok.nextToken())) { + if (entity == null) { continue; } + + entities.add(new VIntWritable(entity)); + reporter.progress(); + } + + if (entities.size() < 2) { + return; + } + + VIntWritable[] entitiesArray = new VIntWritable[entities.size()]; + entitiesArray = entities.toArray(entitiesArray); + entities.clear(); //not needed anymore, free some memory + + int currEntityIndex = 0; + for (VIntWritable entity : entitiesArray) { + if (currEntityIndex + 1 == entitiesArray.length) { return; } + reporter.setStatus((currEntityIndex+1) +"/"+entitiesArray.length+" block "+key); + toEmitFinal.set(Arrays.copyOfRange(entitiesArray, currEntityIndex+1, entitiesArray.length)); + output.collect(entity, toEmitFinal); //toEmitFinal only contains greater entity ids than the key entity's id + currEntityIndex++; + } + + } + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/BasicEntityPruningReducer.java b/MetaBlocking/src/main/java/preprocessing/BasicEntityPruningReducer.java new file mode 100644 index 0000000..0869a65 --- /dev/null +++ 
b/MetaBlocking/src/main/java/preprocessing/BasicEntityPruningReducer.java @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class BasicEntityPruningReducer extends MapReduceBase implements Reducer { + + +private final NullWritable NULL = NullWritable.get(); + + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + Set entities = new HashSet<>(); + boolean first = true; + + while (values.hasNext()) { + VIntArrayWritable nextValue = values.next(); + if (first && !values.hasNext()) { //only one value => no repeated comparisons => singular + return; + } + first = false; + VIntWritable[] next = nextValue.get(); + + for (VIntWritable entity : next) { + if (entities.add(entity) == false) { //entity is nonSingular + output.collect(_key, NULL); //emit the entity as nonSingular (only once) + return; + } + + } + } + } + + /*public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + Set entities = new HashSet<>(); + + while (values.hasNext()) { + + VIntWritable next = values.next(); + + if (next.get() == -1) { + output.collect(_key, NULL); + return; + } + + if (entities.add(next) == false) { //entity is nonSingular + output.collect(_key, NULL); //emit the entity as nonSingular (only once) + return; + } + + } + }*/ +} diff --git a/MetaBlocking/src/main/java/preprocessing/BasicEntityPruningReducerNew.java b/MetaBlocking/src/main/java/preprocessing/BasicEntityPruningReducerNew.java new file mode 100644 index 0000000..1aa3cfd --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/BasicEntityPruningReducerNew.java @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class BasicEntityPruningReducerNew extends MapReduceBase implements Reducer { + + + private final NullWritable NULL = NullWritable.get(); + + VIntWritable fakeId = new VIntWritable(-1); + + Set nonSingulars; + public void configure (JobConf job) { + nonSingulars = new HashSet<>(); + } + + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + Set entities = new TreeSet<>(); + + boolean first = true; + boolean nonSingular = nonSingulars.contains(_key); //initially set to false + + while (values.hasNext()) { + VIntArrayWritable nextValue = values.next(); + if (first && !values.hasNext() && !nonSingular) { //only one value => no repeated comparisons => singular entity + return; + } + first = false; + VIntWritable[] next = nextValue.get(); + + for (VIntWritable entity : next) { + reporter.progress(); + if 
(entity.equals(fakeId)) { + nonSingular = true; + continue; + } + if (entities.add(entity) == false) { //entity is nonSingular + nonSingular = true; + reporter.progress(); + if (nonSingulars.add(entity) == true) { //added for the first time (at most once for each reducer) + output.collect(entity, NULL); //at most once for each reducer + } + } + } + } + + if (nonSingular) { + if (nonSingulars.add(_key) == true) { //added for the first time (at most once for each reducer) + output.collect(_key, NULL); //at most once for each reducer + } + } + + + } +} diff --git a/MetaBlocking/src/main/java/preprocessing/BlockSizeCounterDriver.java b/MetaBlocking/src/main/java/preprocessing/BlockSizeCounterDriver.java new file mode 100644 index 0000000..457a09f --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/BlockSizeCounterDriver.java @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; +import org.apache.hadoop.mapred.lib.IdentityReducer; + + +public class BlockSizeCounterDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.BlockSizeCounterDriver.class); + + conf.setJobName("Block Size Counter"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(TextOutputFormat.class); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //block sizes + + conf.setMapperClass(preprocessing.BlockSizeCounterMapper.class); + conf.setReducerClass(IdentityReducer.class); + + conf.setNumReduceTasks(1); //to sort & merge output in one file + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/BlockSizeCounterMapper.java b/MetaBlocking/src/main/java/preprocessing/BlockSizeCounterMapper.java new file mode 100644 index 0000000..250b1dd --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/BlockSizeCounterMapper.java @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.IOException; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + + + +public class BlockSizeCounterMapper extends MapReduceBase implements Mapper { + + public enum InputComparisons {CLEAN_CLEAN, DIRTY}; + VIntWritable inverseUtility = new VIntWritable(); + + /** + * input key: block id
+ * input value: all entity ids of this block, separated by '#'
+ * output key: cardinality (dirty ER) OR inverseUtility (clean-clean ER)
+ * output value: block id + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + + String valueString = value.toString().replaceFirst(";", ""); + String []entities = valueString.split("#"); + //VIntWritable[] entities = value.get(); + //if (entities.length < 2) {return;} + /*inverseUtility.set(entities.length); //dirty ER (||bk|| > ||bl|| <=> |bk| > |bl|) + + output.collect(inverseUtility, key); //dirty ER +*/ //dirty ER stops here. Whatever follows is only for clean-clean ER... + + + ////////////////////////////////////////////////////////////////////// + // STOP HERE FOR DIRTY ER!!!!! + // START FROM HERE FOR CLEAN-CLEAN ER (after parsing the entities) + ////////////////////////////////////////////////////////////////////// + int D1counter = 0; + + for (String entity : entities) { + Integer entityId = Integer.parseInt(entity); + if (entityId == null) { reporter.setStatus("empty id:"+value); continue; } + if (entityId >= 0) { + D1counter++; + } + } + + int D2counter = entities.length - D1counter; + + reporter.incrCounter(InputComparisons.CLEAN_CLEAN, (long) D1counter * (long) D2counter); + reporter.incrCounter(InputComparisons.DIRTY, (entities.length * (entities.length -1) )/2); + + + //instead of taking utility = 1 / max(|D1|, |D2|), + //take the inverseUtility = max(|D1|, |D2|) and sort in the reverse order (the smaller the better) + inverseUtility.set(Math.max(D1counter, D2counter)); //clean-clean ER + + if (D1counter > 0 && D2counter > 0) { //clean-clean ER + output.collect(inverseUtility, key); + } + } + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexDriver.java b/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexDriver.java new file mode 100644 index 0000000..fcd2804 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexDriver.java @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.BlocksFromEntityIndexReducer.OutputData; + + + +public class BlocksFromEntityIndexDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.BlocksFromEntityIndexDriver.class); + + conf.setJobName("Blocks from Entity Index (Default Balancing)"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntWritable.class); + + conf.setOutputKeyClass(VIntWritable.class); //block id + conf.setOutputValueClass(VIntArrayWritable.class); //list of entities in this block + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, 
CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //Entity Index (Filtered with block filtering) + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //Blocking Collection (Filtered with block filtering) + + conf.setMapperClass(preprocessing.BlocksFromEntityIndexMapper.class); + conf.setReducerClass(preprocessing.BlocksFromEntityIndexReducer.class); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "10"); + conf.set("mapred.max.tracker.failures", "100"); + conf.set("mapred.job.tracker.handler.count", "40"); + + conf.setNumReduceTasks(223); + + conf.setCompressMapOutput(true); + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + //the following is used only for CNP,CEPTotalOrder but does not create any overhead (keep it always) + if (job == null) { + System.err.println("No job found"); + return; + } + + BufferedWriter bwClean = null; + BufferedWriter bwDirty = null; + try { + Counters counters = job.getCounters(); + Long dirtyBlocks = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", + "REDUCE_OUTPUT_RECORDS").getCounter(); + Long cleanBlocks = counters.findCounter(OutputData.CLEAN_BLOCKS).getCounter(); + Path cleanPath=new Path("/user/hduser/numBlocksClean.txt"); + Path dirtyPath=new Path("/user/hduser/numBlocksDirty.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + bwClean = new BufferedWriter(new OutputStreamWriter(fs.create(cleanPath,true))); + bwDirty = new BufferedWriter(new OutputStreamWriter(fs.create(dirtyPath,true))); + bwClean.write(cleanBlocks.toString()); + bwDirty.write(dirtyBlocks.toString()); + } catch (IllegalArgumentException | IOException e) { + System.err.println(e.toString()); + } finally { + try { bwClean.close(); bwDirty.close(); } + catch (IOException e) { System.err.println(e.toString());} + } + } + + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexDriverBalancedFixedPartitions.java b/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexDriverBalancedFixedPartitions.java new file mode 100644 index 0000000..ff8eeb1 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexDriverBalancedFixedPartitions.java @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import hadoopUtils.MapSortByValue; +import hadoopUtils.Partition; +import hadoopUtils.PartitionComparator; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.net.URI; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.PriorityQueue; +import java.util.Queue; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import 
org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.BlocksFromEntityIndexReducer.OutputData; + + + +public class BlocksFromEntityIndexDriverBalancedFixedPartitions extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.BlocksFromEntityIndexDriverBalancedFixedPartitions.class); + + conf.setJobName("Blocks from Entity Index (Balanced With Fixed Number of Partitions)"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntWritable.class); + + conf.setOutputKeyClass(VIntWritable.class); //block id + conf.setOutputValueClass(VIntArrayWritable.class); //list of entities in this block + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //Entity Index (Filtered with block filtering) + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //Blocking Collection (Filtered with block filtering) + + conf.setMapperClass(preprocessing.BlocksFromEntityIndexMapper.class); + conf.setReducerClass(preprocessing.BlocksFromEntityIndexReducer.class); + + conf.setInt("mapred.task.timeout", 10000000); +// conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "10"); + conf.set("mapred.max.tracker.failures", "100"); + conf.set("mapred.job.tracker.handler.count", "40"); + + conf.setCompressMapOutput(true); + + + + ////////////////////////////////// + //Here starts the balancing part// + ////////////////////////////////// + long startTime = System.currentTimeMillis(); + + //a block is a map entry with key: blockId, value: #comparisons + Map blocks = new LinkedHashMap<>(); //keeps order of insertion (blocks are already sorted descending) + + try{ + FileSystem fs = FileSystem.get(new Configuration()); + Path pt=new Path("/user/hduser/afterFilteringBlockSizes.txt"); + BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(pt))); + + String line; + while ((line = br.readLine()) != null) { + String[] block = line.split("\t"); + int blockId = Integer.parseInt(block[0]); + long blockComparisons = Long.parseLong(block[1]); //actually the size in bytes of the next mappers' output (already squared) + blocks.put(blockId, blockComparisons); + } + br.close(); + + + }catch(Exception e){ + System.err.println(e.toString()); + } + + //parameters + int numClusterNodes = 14; //default value + final int numSlotsPerNode = 4; + int numReduceRounds = 1; //default value + + if (args.length == 4) { + numClusterNodes = Integer.parseInt(args[2]); + numReduceRounds = Integer.parseInt(args[3]); + } + + final int SLOTS = numClusterNodes * numSlotsPerNode; + + //sort blocks + Map sortedBlocks = MapSortByValue.sortByValue(blocks); //in descending order of size + + final int numPartitions = SLOTS * numReduceRounds; + + //initialize the queue + Queue pq = new PriorityQueue<>(numPartitions, new PartitionComparator()); + for (int i = 0; i < numPartitions; ++i) { //add new empty +partitions + pq.add(new Partition()); + } + + + while (!sortedBlocks.isEmpty()) { + Map.Entry currentBlock = sortedBlocks.entrySet().iterator().next(); + sortedBlocks.remove(currentBlock.getKey()); + + Partition smallestPartition = pq.poll(); + + 
smallestPartition.addBlock(currentBlock); //add it to the partition + + pq.add(smallestPartition); + + } + + System.out.println("Total partitions\t:\t" + numPartitions); + + try{ + Path pt2=new Path("/user/hduser/blockPartitions.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedWriter bw=new BufferedWriter(new OutputStreamWriter(fs.create(pt2,true))); + + + + //store partitions from biggest to smallest (ids) + for (int i = numPartitions-1; i >= 0; --i) { + Partition partition = pq.poll(); //the smallest partition + + String paritionId = Integer.toString(i); + + for (Integer blockId : partition.getBlocks().keySet()) { //write the mapping to a file, that will later be added to the DistributedCache + bw.write(Integer.toString(blockId)); + bw.write("\t"); + bw.write(paritionId); + bw.newLine(); + } + + } + bw.close(); + DistributedCache.addCacheFile(new URI(pt2.toString()), conf); + } catch(Exception e){ + System.err.println(e.toString()); + } + + long endTime = System.currentTimeMillis(); //to get wall-clock times, as opposed to nanoTime + long balancingOverhead = endTime - startTime; + System.out.println("Load Balancing overhead: "+balancingOverhead+" ms = "+balancingOverhead/1000.0+"sec = "+balancingOverhead/60000.0+" mins."); + //end of load balancing part + + + conf.setNumReduceTasks(numPartitions); + conf.setPartitionerClass(preprocessing.BlocksFromEntityIndexParitioner.class); + + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + //the following is used only for CNP,CEPTotalOrder but does not create any overhead (keep it always) + if (job == null) { + System.err.println("No job found"); + return; + } + + BufferedWriter bwClean = null; + BufferedWriter bwDirty = null; + try { + Counters counters = job.getCounters(); + Long dirtyBlocks = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", + "REDUCE_OUTPUT_RECORDS").getCounter(); + Long cleanBlocks = counters.findCounter(OutputData.CLEAN_BLOCKS).getCounter(); + Path cleanPath=new Path("/user/hduser/numBlocksClean.txt"); + Path dirtyPath=new Path("/user/hduser/numBlocksDirty.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + bwClean = new BufferedWriter(new OutputStreamWriter(fs.create(cleanPath,true))); + bwDirty = new BufferedWriter(new OutputStreamWriter(fs.create(dirtyPath,true))); + bwClean.write(cleanBlocks.toString()); + bwDirty.write(dirtyBlocks.toString()); + } catch (IllegalArgumentException | IOException e) { + System.err.println(e.toString()); + } finally { + try { bwClean.close(); bwDirty.close(); } + catch (IOException e) { System.err.println(e.toString());} + } + } + + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexDriverMaxBlock.java b/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexDriverMaxBlock.java new file mode 100644 index 0000000..ad27df6 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexDriverMaxBlock.java @@ -0,0 +1,238 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import hadoopUtils.MapSortByValue; +import hadoopUtils.Partition; +import hadoopUtils.PartitionComparator; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.net.URI; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.PriorityQueue; +import java.util.Queue; 
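For reference, the balancing loop above (and, with a cap on partition size, the MaxBlock variant that follows) applies a greedy strategy: blocks are taken in descending order of comparisons and each one is charged to the currently least-loaded partition, maintained in a priority queue. Since hadoopUtils.Partition and PartitionComparator are not part of this patch, the sketch below models a partition as a plain running total; names and structure are illustrative only, not the project's implementation.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;

public class GreedyBalancingSketch {

    // Assigns each block (blockId -> #comparisons), largest first, to the currently
    // least-loaded of numPartitions partitions; returns blockId -> partitionId.
    static Map<Integer, Integer> assign(Map<Integer, Long> blockComparisons, int numPartitions) {
        List<Map.Entry<Integer, Long>> blocks = new ArrayList<>(blockComparisons.entrySet());
        blocks.sort((a, b) -> Long.compare(b.getValue(), a.getValue())); // descending #comparisons

        // min-heap of partitions ordered by current load: [load, partitionId]
        PriorityQueue<long[]> partitions = new PriorityQueue<>((a, b) -> Long.compare(a[0], b[0]));
        for (int i = 0; i < numPartitions; i++) {
            partitions.add(new long[]{0L, i});
        }

        Map<Integer, Integer> blockToPartition = new HashMap<>();
        for (Map.Entry<Integer, Long> block : blocks) {
            long[] smallest = partitions.poll();   // least-loaded partition so far
            smallest[0] += block.getValue();       // charge the block's comparisons to it
            blockToPartition.put(block.getKey(), (int) smallest[1]);
            partitions.add(smallest);
        }
        return blockToPartition;
    }

    public static void main(String[] args) {
        Map<Integer, Long> blocks = new LinkedHashMap<>();
        blocks.put(1, 100L);
        blocks.put(2, 60L);
        blocks.put(3, 50L);
        blocks.put(4, 10L);
        System.out.println(assign(blocks, 2)); // one possible result: {1=0, 2=1, 3=1, 4=0}
    }
}

The same idea is what keeps reduce tasks roughly equally loaded; the drivers differ only in how many partitions they create and whether a partition may exceed the size of the largest block.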
+ +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.BlocksFromEntityIndexReducer.OutputData; + + + + + +public class BlocksFromEntityIndexDriverMaxBlock extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.BlocksFromEntityIndexDriverMaxBlock.class); + + conf.setJobName("Blocks from Entity Index (Balanced)"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntWritable.class); + + conf.setOutputKeyClass(VIntWritable.class); //block id + conf.setOutputValueClass(VIntArrayWritable.class); //list of entities in this block + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //Entity Index (Filtered with block filtering) + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //Blocking Collection (Filtered with block filtering) + + conf.setMapperClass(preprocessing.BlocksFromEntityIndexMapper.class); + conf.setReducerClass(preprocessing.BlocksFromEntityIndexReducer.class); + + conf.setInt("mapred.task.timeout", 10000000); +// conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); + conf.set("mapred.reduce.max.attempts", "10"); + conf.set("mapred.max.tracker.failures", "100"); + conf.set("mapred.job.tracker.handler.count", "40"); + + conf.setCompressMapOutput(true); + + + + ////////////////////////////////// + //Here starts the balancing part// + ////////////////////////////////// + + + //a block is a map entry with key: blockId, value: #comparisons + Map blocks = new LinkedHashMap<>(); //keeps order of insertion (blocks are already sorted descending) + + try{ + FileSystem fs = FileSystem.get(new Configuration()); + Path pt=new Path("/user/hduser/afterFilteringBlockSizes.txt"); + BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(pt))); + + String line; + while ((line = br.readLine()) != null) { + String[] block = line.split("\t"); + int blockId = Integer.parseInt(block[0]); + long blockComparisons = Long.parseLong(block[1]); //actually the size in bytes of the next mappers' output (already squared) + blocks.put(blockId, blockComparisons); + } + br.close(); + + }catch(Exception e){ + System.err.println(e.toString()); + } + + //parameters + int numClusterNodes = 14; //default value + final int numSlotsPerNode = 4; + int numReduceRounds = 1; //default value + + if (args.length == 5) { + numClusterNodes = Integer.parseInt(args[2]); + numReduceRounds = Integer.parseInt(args[3]); + } + + final int SLOTS = numClusterNodes * numSlotsPerNode; //#nodes * #slots/node + + //one partition for the largest block + Map sortedBlocks = 
MapSortByValue.sortByValue(blocks); //in descending order of size + Map.Entry largestBlock = sortedBlocks.entrySet().iterator().next(); + sortedBlocks.remove(largestBlock.getKey()); + + int numInitialPartitions = numReduceRounds*SLOTS; + + //initialize the queue + Queue pq = new PriorityQueue<>(numInitialPartitions, new PartitionComparator()); + for (int i = 1; i < numInitialPartitions; ++i) { //add numInitialPartitions new partitions + pq.add(new Partition()); + } + Partition seedPartition = new Partition(); + seedPartition.addBlock(largestBlock); + pq.add(seedPartition); + //maximum comparisons per partition + long partitionComparisons = largestBlock.getValue(); + System.out.println("Partition comparisons\t:\t" + partitionComparisons); + + + while (!sortedBlocks.isEmpty()) { + Map.Entry currentBlock = sortedBlocks.entrySet().iterator().next(); + sortedBlocks.remove(currentBlock.getKey()); + Partition smallestPartition = pq.poll(); + double totalComparisons = smallestPartition.getTotalComparisons() + currentBlock.getValue(); + if (totalComparisons <= partitionComparisons) { // if the new block fits into the smallest partition + smallestPartition.addBlock(currentBlock); //add it to the partition + } else { //otherwise create a new partition for the current block + /* if (SLOTS < sortedBlocks.size()) { //so that no empty slots will be left at the end + for (int i = 1; i < SLOTS; ++i) { //add SLOTS-1 new partitions (so that partitions will be a product of SLOTS + pq.add(new Partition()); + } + }*/ + Partition newPartition = new Partition(); + newPartition.addBlock(currentBlock); + pq.add(newPartition); + } + pq.add(smallestPartition); + + if (sortedBlocks.isEmpty()) { + smallestPartition = pq.poll(); + if (smallestPartition.getTotalComparisons() < 0.9*partitionComparisons && + (int)(0.5*blocks.size()) < smallestPartition.getBlocks().size()) { + sortedBlocks.putAll(smallestPartition.getBlocks()); + partitionComparisons += smallestPartition.getTotalComparisons()/pq.size()+1; + } else { + pq.add(smallestPartition); + } + } + } + + int noOfPartitions = pq.size(); + System.out.println("Total partitions\t:\t" + noOfPartitions); + + try{ + Path pt2=new Path("/user/hduser/blockPartitions.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedWriter bw=new BufferedWriter(new OutputStreamWriter(fs.create(pt2,true))); + + + + //store partitions from biggest to smallest (ids) + for (int i = noOfPartitions-1; i >= 0; --i) { + Partition partition = pq.poll(); //the smallest partition + + String paritionId = Integer.toString(i); + + for (Integer blockId : partition.getBlocks().keySet()) { //write the mapping to a file, that will later be added to the DistributedCache + bw.write(Integer.toString(blockId)); + bw.write("\t"); + bw.write(paritionId); + bw.newLine(); + } + + } + bw.close(); + DistributedCache.addCacheFile(new URI(pt2.toString()), conf); + } catch(Exception e){ + System.err.println(e.toString()); + } + + + conf.setNumReduceTasks(noOfPartitions); + conf.setPartitionerClass(preprocessing.BlocksFromEntityIndexParitioner.class); + + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + //the following is used only for CNP,CEPTotalOrder but does not create any overhead (keep it always) + if (job == null) { + System.err.println("No job found"); + return; + } + + BufferedWriter bwClean = null; + BufferedWriter bwDirty = null; + try { + Counters counters = job.getCounters(); + Long dirtyBlocks = 
counters.findCounter("org.apache.hadoop.mapred.Task$Counter", + "REDUCE_OUTPUT_RECORDS").getCounter(); + Long cleanBlocks = counters.findCounter(OutputData.CLEAN_BLOCKS).getCounter(); + Path cleanPath=new Path("/user/hduser/numBlocksClean.txt"); + Path dirtyPath=new Path("/user/hduser/numBlocksDirty.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + bwClean = new BufferedWriter(new OutputStreamWriter(fs.create(cleanPath,true))); + bwDirty = new BufferedWriter(new OutputStreamWriter(fs.create(dirtyPath,true))); + bwClean.write(cleanBlocks.toString()); + bwDirty.write(dirtyBlocks.toString()); + } catch (IllegalArgumentException | IOException e) { + System.err.println(e.toString()); + } finally { + try { bwClean.close(); bwDirty.close(); } + catch (IOException e) { System.err.println(e.toString());} + } + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexMapper.java b/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexMapper.java new file mode 100644 index 0000000..c8fb005 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexMapper.java @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.IOException; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + + + + +public class BlocksFromEntityIndexMapper extends MapReduceBase implements Mapper { + + /** + * maps an input entity index into (key, value) pair(s) + * the value is the entity id (input key) along with the ids of blocks that contain it + * the key each time is a block id (each element of the input value array) + * @param key an entity id + * @param value an array of block ids that this entity belongs to + * @param output key: a block id (each element of the input value array) - value: an entity id of this block (the input key) + */ + public void map(VIntWritable key, VIntArrayWritable value, + OutputCollector output, Reporter reporter) throws IOException { + + VIntWritable [] Bi = value.get(); //the blocks of this entity + + for (VIntWritable bi : Bi) { + output.collect(bi, key); + } + + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexParitioner.java b/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexParitioner.java new file mode 100644 index 0000000..a197d8b --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexParitioner.java @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Partitioner; + +public class BlocksFromEntityIndexParitioner implements Partitioner{ + + //private final int MAX_BLOCK_ID = 1499534; //the number of lines in BlockSizes/part-00000 file + + Map blockPartitions; + Path[] localFiles; + + @Override + public void configure(JobConf job) { + blockPartitions = new HashMap<>(); + + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(job); + SW = new 
BufferedReader(new FileReader(localFiles[0].toString()));
+			String line;
+			while ((line = SW.readLine()) != null) {
+				if (line.trim().isEmpty()) {break;}
+				String[] block = line.split("\t");
+				blockPartitions.put(Integer.parseInt(block[0]), Integer.parseInt(block[1]));
+			}
+			SW.close();
+		} catch (FileNotFoundException e) {
+			System.err.println(e.toString());
+		} catch (IOException e) {
+			System.err.println(e.toString());
+		}
+	}
+
+	//option 4
+	@Override
+	public int getPartition(VIntWritable key, VIntWritable value, int numPartitions) {
+		int blockId = key.get();
+		return blockPartitions.get(blockId); //assumes every block id appears in the cached mapping; a missing id would throw a NullPointerException
+	}
+
+//	//option 3
+//	@Override
+//	public int getPartition(VIntWritable key, Text value, int numPartitions) {
+//		int blockId = key.get();
+//		int inverseBlockId = MAX_BLOCK_ID - blockId; //the largest block is the one with MAX_BLOCK_ID
+//		if (inverseBlockId / numPartitions == 0) {// the first group of numPartitions block ids
+//			return inverseBlockId % numPartitions;
+//		}else {
+//			return numPartitions-1-inverseBlockId%numPartitions;
+//		}
+//	}
+
+	//option 2
+//	@Override
+//	public int getPartition(VIntWritable key, Text value, int numPartitions) {
+//		int blockId = key.get();
+//		int inverseBlockId = MAX_BLOCK_ID - blockId; //the largest block is the one with MAX_BLOCK_ID
+//		if (inverseBlockId / numPartitions % 2 == 0) {// odd-numbered group of numPartitions block ids (counting from 1)
+//			return inverseBlockId % numPartitions;
+//		}else {
+//			return numPartitions-1-inverseBlockId%numPartitions;
+//		}
+//	}
+
+	//option 1
+//	@Override
+//	public int getPartition(VIntWritable key, Text value, int numPartitions) {
+//		int blockId = key.get();
+//		return (MAX_BLOCK_ID - blockId) % numPartitions;
+//	}
+
+
+
+
+
+
+
+}
diff --git a/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexReducer.java b/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexReducer.java
new file mode 100644
index 0000000..815a572
--- /dev/null
+++ b/MetaBlocking/src/main/java/preprocessing/BlocksFromEntityIndexReducer.java
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2015 Vasilis Efthymiou
+ */
+package preprocessing;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.hadoop.io.VIntWritable;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+
+public class BlocksFromEntityIndexReducer extends MapReduceBase implements Reducer<VIntWritable, VIntWritable, VIntWritable, VIntArrayWritable> {
+
+	static enum OutputData {CLEAN_BLOCKS, COMPARISONS};
+
+	/**
+	 * @param _key a block id
+	 * @param values a list of entity ids in this block
+	 * @param output key: block id (same as input key).
+	 *               value: a concatenation of the input entity ids into one array
+	 *
+	 */
+	public void reduce(VIntWritable _key, Iterator<VIntWritable> values,
+			OutputCollector<VIntWritable, VIntArrayWritable> output, Reporter reporter) throws IOException {
+
+		long negatives = 0;
+		List<VIntWritable> inputList = new ArrayList<>();
+		boolean atLeastTwoEntities = false;
+		boolean containsNegative = false;
+		boolean containsPositive = false;
+		while (values.hasNext()) {
+			if (!inputList.isEmpty()) { //true once a second entity is seen (checking size() > 1 here would wrongly purge blocks with exactly two entities)
+				atLeastTwoEntities = true;
+			}
+			int next = values.next().get();
+			inputList.add(new VIntWritable(next));
+
+			if (next < 0) {
+				containsNegative = true;
+				negatives++;
+			} else {
+				containsPositive = true;
+			}
+		}
+		if (atLeastTwoEntities) { //else purge this block
+			VIntWritable[] tmpArray = new VIntWritable[inputList.size()];
+
+//			output.collect(_key, new VIntArrayWritable(inputList.toArray(tmpArray))); //dirty
+			if (containsNegative && containsPositive) { //a valid block for clean-clean ER
+				output.collect(_key, new VIntArrayWritable(inputList.toArray(tmpArray))); //clean-clean
+				reporter.incrCounter(OutputData.CLEAN_BLOCKS, 1);
+				reporter.incrCounter(OutputData.COMPARISONS, negatives*(tmpArray.length-negatives));
+			} //DIRTY_BLOCKS = REDUCE_OUTPUT_RECORDS
+		} //PURGED_BLOCKS = REDUCE_INPUT_GROUPS - REDUCE_OUTPUT_RECORDS
+	}
+
+}
diff --git a/MetaBlocking/src/main/java/preprocessing/BlocksPerEntity.java b/MetaBlocking/src/main/java/preprocessing/BlocksPerEntity.java
new file mode 100644
index 0000000..05eaaec
--- /dev/null
+++ b/MetaBlocking/src/main/java/preprocessing/BlocksPerEntity.java
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2015 Vasilis Efthymiou
+ */
+package preprocessing;
+
+import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.VIntWritable;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.RunningJob;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+
+import preprocessing.VIntArrayWritable;
+
+
+
+class BlocksPerEntityMapper extends MapReduceBase implements Mapper<VIntWritable, VIntArrayWritable, VIntWritable, VIntWritable> {
+
+	/**
+	 *
+	 * @param key an entity id
+	 * @param value an array of block ids that this entity belongs to
+	 * @param output key: the input key - value: the number of blocks in the input value
+	 */
+	public void map(VIntWritable key, VIntArrayWritable value,
+			OutputCollector<VIntWritable, VIntWritable> output, Reporter reporter) throws IOException {
+		output.collect(key, new VIntWritable(value.get().length));
+	}
+
+}
+
+public class BlocksPerEntity extends Configured {
+
+	public static void main(String[] args) {
+		JobClient client = new JobClient();
+		JobConf conf = new JobConf(preprocessing.BlocksPerEntity.class);
+
+		conf.setJobName("Number of blocks per entity (from EI)");
+
+		conf.setOutputKeyClass(VIntWritable.class); //entity id
+		conf.setOutputValueClass(VIntWritable.class); //number of blocks (after block filtering)
+
+		conf.setInputFormat(SequenceFileInputFormat.class);
+		conf.setOutputFormat(TextOutputFormat.class);
+
+		FileInputFormat.setInputPaths(conf, new Path(args[0])); //Entity Index (Filtered with block filtering)
+ 
FileOutputFormat.setOutputPath(conf, new Path(args[1])); //BlocksPerEntity + + conf.setMapperClass(BlocksPerEntityMapper.class); + conf.setNumReduceTasks(0); + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + if (job == null) { + System.err.println("No job found"); + return; + } + + if (args[1].endsWith("/")) { + args[1] = args[1].substring(0, args[1].length()-1); + } + String mergedFile = args[1]+".txt"; + Configuration configuration = new Configuration(); + try { + FileSystem hdfs = FileSystem.get(configuration); + FileUtil.copyMerge(hdfs, new Path(args[1]), hdfs, new Path(mergedFile), false, configuration, null); + } catch (IOException e) { + System.err.println(e); + } + + } + + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EJSDriver.java b/MetaBlocking/src/main/java/preprocessing/EJSDriver.java new file mode 100644 index 0000000..a8077e9 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EJSDriver.java @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.EJSMapper.OutputData; + + +public class EJSDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.EJSDriver.class); + + conf.setJobName("EJS preprocessing from Extended Input"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(Text.class); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //ExtendedInput + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //EJS intermediate + + conf.setMapperClass(preprocessing.EJSMapper.class); + conf.setReducerClass(preprocessing.EJSReducer.class); + + //conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + conf.setNumReduceTasks(336); + + conf.setCompressMapOutput(true); + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + //the following is used only for CNP,CEPTotalOrder but does not create any overhead (keep it always) + if (job == null) { + System.err.println("No job found"); + return; + } + + try { + Counters counters = job.getCounters(); + Long validComparisons = counters.findCounter(OutputData.VALID_COMPARISONS_X2).getCounter()/2; + Path pt=new Path("/user/hduser/validComparisons.txt"); + FileSystem 
fs = FileSystem.get(new Configuration()); + BufferedWriter br=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))); + br.write(validComparisons.toString()); + br.close(); + } catch (IllegalArgumentException | IOException e) { + System.err.println(e.toString()); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EJSFinalDriver.java b/MetaBlocking/src/main/java/preprocessing/EJSFinalDriver.java new file mode 100644 index 0000000..259c994 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EJSFinalDriver.java @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.hadoop.mapred.lib.IdentityMapper; + + +public class EJSFinalDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.EJSFinalDriver.class); + + conf.setJobName("Final Preprocessing for EJS"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntArrayWritable.class); + + conf.setOutputKeyClass(VIntWritable.class); //block id + conf.setOutputValueClass(Text.class); //list of entities in this block, along with their other blocks and their #comparisons + + conf.setInputFormat(SequenceFileInputFormat.class); + //conf.setOutputFormat(TextOutputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //EJS intermediate + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //extended input file (blocking collection with entity index and cardinality) + + conf.setMapperClass(IdentityMapper.class); //forward input to reducer + conf.setReducerClass(preprocessing.ExtendedInputReducer.class); //just concat the input intarrays, e.g. 
[][][]...[] + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); //acceptable failures before the whole job fails + conf.set("mapred.reduce.max.attempts", "10"); //before it is not started again + conf.set("mapred.max.tracker.failures", "100"); //before it gets black-listed + conf.set("mapred.job.tracker.handler.count", "40"); + + conf.setNumReduceTasks(224); + + client.setConf(conf); + + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EJSMapper.java b/MetaBlocking/src/main/java/preprocessing/EJSMapper.java new file mode 100644 index 0000000..dc08c56 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EJSMapper.java @@ -0,0 +1,164 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import hadoopUtils.MBTools; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import advanced.AverageWeightMapper.OutputData; +import advanced.AverageWeightMapper.Weight; + +public class EJSMapper extends MapReduceBase implements Mapper { + + + public enum OutputData {PURGED_BLOCKS, REMOVED_ENTITIES, VALID_COMPARISONS_X2}; + + private VIntWritable ei = new VIntWritable(); + Text biComparisonsEi = new Text(); + + + /** + * input: an extended blocking collection + * @param key block id + * @param value arrays of entity ids in this block (first element), along with the block ids (sorted) that contain them (remaining elements) + * e.g. [1,7,8,9][3,1,8,10] means that in this block belong the entities 1 and 3 and entity 1 is placed in blocks 7,8,9 (sorted) and + * entity 3 is placed in blocks 1,8,10 + * @param output key: entity id (each of the input values). 
value: inputKey,#non-redundant comparisons for ei in this block + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + reporter.setStatus("splitting the block "+key); + + Map> entityIndex = new TreeMap<>(); //key is entity id, value is the list of blocks that contain the key + List blocks; + String[] entityIndices = value.toString().split("]"); //each entityIndex is an array with the first element the entity and the rest elements its blocks + for (String tmpEntityIndex : entityIndices) { + if (tmpEntityIndex == null || tmpEntityIndex.length() < 2) {continue;} + tmpEntityIndex = tmpEntityIndex.substring(1); //to remove the initial '[' + String[] idsArray = tmpEntityIndex.split(", "); + int entityId = Integer.parseInt(idsArray[0]); + blocks = new ArrayList<>(idsArray.length-1); //maybe initial capacity is not needed + for (int i=1; i < idsArray.length; ++i) { + blocks.add(Integer.parseInt(idsArray[i])); + } + entityIndex.put(entityId, blocks); + } + + /*//clean-clean ER + List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + for (int entity : entityIndex.keySet()) { + if (entity < 0) { + D2entities.add(entity); + } else { + D1entities.add(entity); + } + } + if (D1entities.isEmpty() || D2entities.isEmpty()) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + }*/ + + //dirty ER + List entities = new ArrayList<>(entityIndex.keySet()); + if (entities.size() < 2) { + reporter.incrCounter(OutputData.PURGED_BLOCKS, 1); + return; + } + + //clean-clean ER + /*int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + int D1size = D1entities.size(); + //TODO: add formatting, to skip many decimal digits in weight string + + Map entityComparisons = new HashMap<>(); + + for (int e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1size); + blockse1 = entityIndex.get(e1); + for (int e2 : D2entities) { + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Integer previous = entityComparisons.get(e1); + Integer newCount = previous == null ? 1 : previous+1; + entityComparisons.put(e1, newCount); + + previous = entityComparisons.get(e2); + newCount = previous == null ? 1 : previous+1; + entityComparisons.put(e2, newCount); + } + } + } + + reporter.incrCounter(OutputData.REMOVED_ENTITIES, D1entities.size()+D2entities.size()- entityComparisons.keySet().size()); + + */ + + + //dirty ER + int blockId = key.get(); + List blockse1; + List blockse2; + int counter = 0; + Integer []entitiesArray = new Integer[entities.size()]; + entitiesArray = entities.toArray(entitiesArray); + int blockSize = entitiesArray.length; + + Map entityComparisons = new HashMap<>(); + + for (int i = 0; i < blockSize-1; ++i) { + int e1 = entitiesArray[i]; + reporter.setStatus(++counter+"/"+blockSize); + blockse1 = entityIndex.get(e1); + for (int j = i+1; j < blockSize; ++j) { + int e2 = entitiesArray[j]; + blockse2 = entityIndex.get(e2); + if (!MBTools.isRepeated(blockse1, blockse2, blockId)) { + Integer previous = entityComparisons.get(e1); + Integer newCount = previous == null ? 1 : previous+1; + entityComparisons.put(e1, newCount); + + previous = entityComparisons.get(e2); + newCount = previous == null ? 
1 : previous+1; + entityComparisons.put(e2, newCount); + } + } + } + + reporter.incrCounter(OutputData.REMOVED_ENTITIES, entities.size()- entityComparisons.keySet().size()); + + + + + //common for both dirty and clean-clean + for (Map.Entry entity : entityComparisons.entrySet()) { + Integer comparisons = entity.getValue(); + //if (comparisons == null) { continue; } //no non-redundant comparisons for this entity + ei.set(entity.getKey()); + biComparisonsEi.set(key+","+comparisons); + output.collect(ei, biComparisonsEi); + reporter.incrCounter(OutputData.VALID_COMPARISONS_X2, comparisons); + } + + + + } + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EJSReducer.java b/MetaBlocking/src/main/java/preprocessing/EJSReducer.java new file mode 100644 index 0000000..c9edc5d --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EJSReducer.java @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class EJSReducer extends MapReduceBase implements Reducer { + + VIntArrayWritable toEmit = new VIntArrayWritable(); + + /** + * @param _key entityId + * @param values a list of concatenations of blockId,#distinctComparisonsOfentityId in this block + * @param output key: blockId for each blockId in the values. value: [entityId,blockIds,...,#totalComparisonsOfEid] + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + List blocks = new ArrayList<>(); + int comparisons = 0; + while (values.hasNext()) { + String[] value = values.next().toString().split(","); + blocks.add(new VIntWritable(Integer.parseInt(value[0]))); + comparisons += Integer.parseInt(value[1]); + } + + VIntWritable[] toEmitArray = new VIntWritable[blocks.size()+2]; + toEmitArray[0] = _key; //first index->Eid + System.arraycopy(blocks.toArray(), 0, toEmitArray, 1, blocks.size()); //middle ->block + toEmitArray[blocks.size()+1] = new VIntWritable(comparisons); //last index->#comparisons + + toEmit.set(toEmitArray); + + for (VIntWritable blockId : blocks) { + output.collect(blockId, toEmit); + } + + + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityIdsToIntDriver.java b/MetaBlocking/src/main/java/preprocessing/EntityIdsToIntDriver.java new file mode 100644 index 0000000..105504e --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityIdsToIntDriver.java @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VLongWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + + + +public class EntityIdsToIntDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.EntityIdsToIntDriver.class); + + 
conf.setJobName("New Ids for entities"); + + conf.setOutputKeyClass(VLongWritable.class); + conf.setOutputValueClass(Text.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //entity collection + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //entity collection with new int ids + + conf.setMapperClass(preprocessing.EntityIdsToIntMapper.class); + //conf.setReducerClass(IdentityReducer.class); //just the first time for debugging (counter) + + conf.setNumReduceTasks(160); //to merge & sort output in one file + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityIdsToIntMapper.java b/MetaBlocking/src/main/java/preprocessing/EntityIdsToIntMapper.java new file mode 100644 index 0000000..61bfc91 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityIdsToIntMapper.java @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.IOException; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VLongWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class EntityIdsToIntMapper extends MapReduceBase implements Mapper { + + static enum InputData {MALFORMED_PAIRS, DBPEDIA, DATASOURCE2, WRAPAROUND}; + + private int currentMapper; + private int counter; + public void configure(JobConf conf) { + currentMapper = Integer.parseInt(conf.get("mapred.task.partition")); + counter = 0; //how many ids in this mapper? 
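The map() method just below derives each entity's new numeric id with Cantor's pairing function, combining the mapper index with a per-mapper counter so that ids produced by different mappers cannot collide. A minimal illustration of that formula follows; the class and method names here are illustrative only and are not part of the patch.

public class CantorPairingSketch {
    // Cantor pairing: maps a pair of non-negative integers to a unique non-negative long,
    // mirroring the expression used in EntityIdsToIntMapper.map().
    static long pair(long k1, long k2) {
        return ((k1 + k2) * (k1 + k2 + 1)) / 2 + k2;
    }

    public static void main(String[] args) {
        // e.g. mapper 2, third entity in that mapper: (2+3)*(2+3+1)/2 + 3 = 18
        System.out.println(pair(2, 3)); // 18
        // swapping the arguments gives a different id, so (mapper, counter) pairs never collide
        System.out.println(pair(3, 2)); // 17
    }
}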
+ } + + /** + * creates a new int key for each string entity id (initially a uri) + * apply cantor's pair function to create a unique id, based on + * the current mapper's id (number 1) and the current counter of the entity within the mapper (number 2) + * non-dbpedia get negative id + * @param key the entity id + * @param value the attribute-value pairs of the entity,separated by "###" + * @param output key: a new entity id, value: the same as the input value + * + */ + public void map(Text key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + + String[] elements = value.toString().split("###"); //att-value pairs + reporter.progress(); + + if (elements.length < 2 || elements.length % 2 != 0) { + reporter.incrCounter(InputData.MALFORMED_PAIRS, 1); + System.out.println("Malformed: "+elements); + return; + } + counter++; + //use cantor's pairing function from currentMapper (number1) and local counter (number2) + long newId = (((currentMapper+counter)*((long)(currentMapper+counter+1)))/2) + counter; + if (newId < 0) { //in case the int is wrapped around + newId = -newId; + reporter.incrCounter(InputData.WRAPAROUND, 1); + } + if (key.charAt(0)!= '0') { //dbpedia entities start with 0;;;DBpediaURI + reporter.incrCounter(InputData.DATASOURCE2, 1); + newId = -newId; //assign negative ids to non-dbpedia datasources + } else { + reporter.incrCounter(InputData.DBPEDIA, 1); + } + output.collect(new VLongWritable(newId), value); + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityIndexDriver.java b/MetaBlocking/src/main/java/preprocessing/EntityIndexDriver.java new file mode 100644 index 0000000..e52a572 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityIndexDriver.java @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.net.URI; +import java.net.URISyntaxException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.EntityIndexReducer.OutputData; + + + +public class EntityIndexDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.EntityIndexDriver.class); + + conf.setJobName("Entity Index (With Block Filtering)"); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntWritable.class); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + //conf.setOutputFormat(TextOutputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new 
Path(args[2])); //entity index + + conf.setMapperClass(preprocessing.EntityIndexMapper.class); + conf.setReducerClass(preprocessing.EntityIndexReducer.class); + //conf.setReducerClass(preprocessing.EntityIndexReducerNoFiltering.class); + + conf.setNumReduceTasks(56); + + conf.setInt("mapred.task.timeout", 10000000); + + try { + DistributedCache.addCacheFile(new URI(args[1]+"/part-00000"), conf); // block sizes + } catch (URISyntaxException e1) { + System.err.println(e1.toString()); + } + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + //the following is used only for CNP,CEPTotalOrder but does not create any overhead (keep it always) + if (job == null) { + System.err.println("No job found"); + return; + } + + try { + Counters counters = job.getCounters(); + Long entities = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", + "REDUCE_OUTPUT_RECORDS").getCounter(); + long blockAssignments = counters.findCounter(OutputData.BLOCK_ASSIGNMENTS).getCounter(); + Float BCin = blockAssignments / (float) entities; + Integer K = ((Double)Math.floor(blockAssignments / 2.0)).intValue(); + Path pt=new Path("/user/hduser/BCin.txt"); + Path k=new Path("/user/hduser/CEPk.txt"); //not tested + Path N=new Path("/user/hduser/numEntities.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedWriter br=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))); + BufferedWriter br2=new BufferedWriter(new OutputStreamWriter(fs.create(k,true))); + BufferedWriter br3=new BufferedWriter(new OutputStreamWriter(fs.create(N,true))); + br.write(BCin.toString()); + br2.write(K.toString()); + br3.write(entities.toString()); + br.close(); + br2.close(); + br3.close(); + } catch (IllegalArgumentException | IOException e) { + System.err.println(e.toString()); + } + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/preprocessing/EntityIndexDriverARCS.java b/MetaBlocking/src/main/java/preprocessing/EntityIndexDriverARCS.java new file mode 100644 index 0000000..360efd0 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityIndexDriverARCS.java @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.net.URI; +import java.net.URISyntaxException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.EntityIndexReducerARCS.OutputData; + +public class EntityIndexDriverARCS { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.EntityIndexDriverARCS.class); + + conf.setJobName("Entity Index (With Block Filtering) for ARCS"); + + /*conf.setMapOutputKeyClass(VIntWritable.class); + 
conf.setMapOutputValueClass(VIntArrayWritable.class);*/ + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + //conf.setOutputFormat(TextOutputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[2])); //entity index + + conf.setMapperClass(preprocessing.EntityIndexMapperARCS.class); + conf.setReducerClass(preprocessing.EntityIndexReducerARCS.class); + //conf.setReducerClass(preprocessing.EntityIndexReducerNoFiltering.class); + + conf.setNumReduceTasks(56); + + try { + DistributedCache.addCacheFile(new URI(args[1]+"/part-00000"), conf); // block sizes + } catch (URISyntaxException e1) { + System.err.println(e1.toString()); + } + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + //the following is used only for CNP,CEPTotalOrder but does not create any overhead (keep it always) + if (job == null) { + System.err.println("No job found"); + return; + } + + try { + Counters counters = job.getCounters(); + long entities = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", + "REDUCE_OUTPUT_RECORDS").getCounter(); + long blockAssignments = counters.findCounter(OutputData.BLOCK_ASSIGNMENTS).getCounter(); + Float BCin = blockAssignments / (float) entities; + Integer K = ((Double)Math.floor(blockAssignments / 2.0)).intValue(); + Path pt=new Path("/user/hduser/BCin.txt"); + Path k=new Path("/user/hduser/CEPk.txt"); //not tested + FileSystem fs = FileSystem.get(new Configuration()); + BufferedWriter br=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))); + BufferedWriter br2=new BufferedWriter(new OutputStreamWriter(fs.create(k,true))); + br.write(BCin.toString()); + br2.write(K.toString()); + br.close(); + br2.close(); + } catch (IllegalArgumentException | IOException e) { + System.err.println(e.toString()); + } + } + +} \ No newline at end of file diff --git a/MetaBlocking/src/main/java/preprocessing/EntityIndexMapper.java b/MetaBlocking/src/main/java/preprocessing/EntityIndexMapper.java new file mode 100644 index 0000000..aca6fbb --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityIndexMapper.java @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.IOException; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class EntityIndexMapper extends MapReduceBase implements Mapper { + + VIntWritable entityId = new VIntWritable(); + /** + * input: a blocking collection + * input key: block id + * input value: entity ids in this block, separated by "#" + * output key: entity id (each of the input values) + * output value: block id (the same as the input key) + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + +// String valueString = value.toString().replaceFirst(";", ""); //clean +// String []entities = valueString.split("#"); //clean + String []entities = value.toString().split("#"); //dirty +// 
VIntWritable[] entities = value.get(); + + for (String entity : entities) { + entityId.set(Integer.parseInt(entity)); +// if (entity == null) { continue; } + output.collect(entityId, key); + } + } + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityIndexMapperARCS.java b/MetaBlocking/src/main/java/preprocessing/EntityIndexMapperARCS.java new file mode 100644 index 0000000..e22de35 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityIndexMapperARCS.java @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.IOException; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class EntityIndexMapperARCS extends MapReduceBase implements Mapper { + + VIntWritable entityId = new VIntWritable(); + VIntWritable blockSize = new VIntWritable(); + VIntWritable[] compositeValueComponents = new VIntWritable[2]; + VIntArrayWritable compositeValue = new VIntArrayWritable(); + /** + * input: a blocking collection + * @param inputKey block id + * @param value entity ids in this block, separated by "#" + * @param output + * key: entity id (each of the input values) + * value: [blockId (i.e. inputKey), blockSize] (for ARCS). + */ + public void map(VIntWritable inputKey, Text value, + OutputCollector output, Reporter reporter) throws IOException { + + // String valueString = value.toString().replaceFirst(";", ""); + String []entities = value.toString().split("#"); +// VIntWritable[] entities = value.get(); + blockSize.set(entities.length); + compositeValueComponents[0] = inputKey; + compositeValueComponents[1] = blockSize; + + for (String entity : entities) { + entityId.set(Integer.parseInt(entity)); + if (entityId == null) { continue; } + compositeValue.set(compositeValueComponents); + output.collect(entityId, compositeValue); + } + } + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityIndexReducer.java b/MetaBlocking/src/main/java/preprocessing/EntityIndexReducer.java new file mode 100644 index 0000000..4087bcf --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityIndexReducer.java @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class EntityIndexReducer extends MapReduceBase implements Reducer { + + + static enum OutputData {D1Entities, D2Entities, BLOCK_ASSIGNMENTS, TEST}; + + Map blockUtils; //(blockId, rank) - rank is based on utility + private Path[] localFiles; + + + public void configure(JobConf job){ + + blockUtils = new HashMap<>(); + int blockRank = 0; //based on its relative position in the sorted list by utility + + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(job); //blocks sorted by 
utility (descending) + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + String line; + while ((line = SW.readLine()) != null) { + Integer block = Integer.parseInt(line.substring(line.indexOf("\t")+1)); //line has the form: utility+"\t"+blockId + blockUtils.put(block, blockRank++); //blocks are already sorted + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + } + + + /** + * Builds the Entity Index, after performing Block Filtering + * To skip the block filtering part, just output all the blocks and not the top MAX_BLOCKS + * by commenting out the specified line + * @param _key entity id + * @param values block ids of the current entity + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + //store the blocks of this entity in ascending order of utility rank + //since each block has a unique rank, this rank can be used as a new block id + Set blocks = new TreeSet<>(); //TreeSet keeps the Set sorted (ascending) + + while (values.hasNext()) { + Integer block = values.next().get(); //the block id + Integer rank = blockUtils.get(block); //the global rank of this block, based on its utility + if (rank != null) { + blocks.add(rank); //store the block using its rank as an id + } + } + + + //local threshold for block filtering + //final int MAX_BLOCKS = ((Double)Math.floor(blocks.size()/3+1)).intValue(); //|_ |Bi|/3+1 _| //graph-free + final int MAX_BLOCKS = ((Double)Math.floor(3*blocks.size()/4+1)).intValue(); //|_ 3|Bi|/4+1 _| //preprocessing + //final int MAX_BLOCKS = ((Double)Math.floor(3*blocks.size()/4)).intValue(); //|_ 3|Bi|/4+1 _| //preprocessing + + Set toEmit = new TreeSet<>(); + + int indexedBlocks = 0; + for (Integer block : blocks) { //returned in ascending order of rank (highest utility->rank 0) + toEmit.add(new VIntWritable(block)); + if (++indexedBlocks == MAX_BLOCKS) { break;} //comment-out this line to skip block filtering + } + + //transform the set to an array, which will be the final output (toEmitFinal) + VIntWritable[] toEmitArray = new VIntWritable[toEmit.size()]; + toEmitArray = toEmit.toArray(toEmitArray); + VIntArrayWritable toEmitFinal = new VIntArrayWritable(toEmitArray); + + //VIntArrayWritable toEmitFinal = hadoopUtils.RelativePositionCompression.compress(toEmit); + + if (indexedBlocks > 0) { + if (_key.get() >= 0) { + reporter.incrCounter(OutputData.D1Entities, 1); + } else { + reporter.incrCounter(OutputData.D2Entities, 1); + } + output.collect(_key, toEmitFinal); + reporter.incrCounter(OutputData.BLOCK_ASSIGNMENTS, toEmit.size()); + //BC = BLOCK_ASSIGNMENTS / REDUCE_OUTPUT_RECORDS; + } //else skip this entity (it is not placed in any block) + } + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityIndexReducerARCS.java b/MetaBlocking/src/main/java/preprocessing/EntityIndexReducerARCS.java new file mode 100644 index 0000000..55de10d --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityIndexReducerARCS.java @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; +import org.apache.hadoop.filecache.DistributedCache; 
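Both EntityIndexReducer above and the ARCS variant below implement block filtering with the local threshold floor(3*|Bi|/4) + 1: each entity keeps only that many of its highest-utility blocks, where rank 0 denotes the highest utility. Stripped of the Hadoop Writable types, the rule reduces to the sketch below; the class and method names are illustrative only, not part of the patch.

import java.util.Arrays;
import java.util.Collection;
import java.util.SortedSet;
import java.util.TreeSet;

public class BlockFilteringSketch {
    // Keeps, for one entity, only its floor(3*|Bi|/4)+1 highest-utility blocks.
    // Blocks are given as utility ranks (0 = highest utility), mirroring the
    // rank-based block ids used by EntityIndexReducer.
    static SortedSet<Integer> filterBlocks(Collection<Integer> blockRanks) {
        SortedSet<Integer> sorted = new TreeSet<>(blockRanks);          // ascending rank = best first
        int maxBlocks = (int) Math.floor(3 * sorted.size() / 4.0) + 1;  // local threshold
        SortedSet<Integer> kept = new TreeSet<>();
        for (int rank : sorted) {
            kept.add(rank);
            if (kept.size() == maxBlocks) break;
        }
        return kept;
    }

    public static void main(String[] args) {
        // an entity appearing in 8 blocks keeps its 7 highest-utility ones (rank 45 is dropped)
        System.out.println(filterBlocks(Arrays.asList(12, 3, 45, 7, 19, 30, 2, 41)));
    }
}

The ARCS reducer applies exactly the same cut-off, but carries the block size alongside each rank so that the later ARCS weighting can divide by block cardinality.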
+import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class EntityIndexReducerARCS extends MapReduceBase implements Reducer { + + + static enum OutputData {D1Entities, D2Entities, BLOCK_ASSIGNMENTS, TEST}; + + Map blockUtils; //(blockId, rank) - rank is based on utility + private Path[] localFiles; + + + public void configure(JobConf job){ + + blockUtils = new HashMap<>(); + int blockRank = 0; //based on its relative position in the sorted list by utility + + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(job); //blocks sorted by utility (descending) + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + String line; + while ((line = SW.readLine()) != null) { + Integer block = Integer.parseInt(line.substring(line.indexOf("\t")+1)); //line has the form: utility+"\t"+blockId + blockUtils.put(block, blockRank++); //blocks are already sorted + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + } + + + /** + * Builds the Entity Index, after performing Block Filtering + * To skip the block filtering part, just output all the blocks and not the top MAX_BLOCKS + * by commenting out the specified line + * @param _key entity id + * @param values list of [blockId,blockSize] of the current entity + * @param output key: same as input key. value: [blockId,blockSize,blockId,blockSize,...] for retained blocks + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + //store the blocks of this entity in ascending order of utility rank + //since each block has a unique rank, this rank can be used as a new block id + Map blocks = new TreeMap<>(); //TreeSet keeps the Set sorted (ascending) + + while (values.hasNext()) { + VIntArrayWritable value = values.next(); + Integer block = value.get()[0].get(); //the block id + Integer rank = blockUtils.get(block); //the global rank of this block, based on its utility + if (rank != null) { + blocks.put(rank,value.get()[1].get()); //store the block using its rank as an id (key of map) and its size as the value of map + } + } + + + //local threshold for block filtering + //final int MAX_BLOCKS = ((Double)Math.floor(blocks.size()/3+1)).intValue(); //|_ |Bi|/3+1 _| //graph-free + final int MAX_BLOCKS = ((Double)Math.floor(3*blocks.size()/4+1)).intValue(); //|_ 3|Bi|/4+1 _| //preprocessing +// final int MAX_BLOCKS = ((Double)Math.floor(3*blocks.size()/4)).intValue(); //|_ 3|Bi|/4+1 _| //preprocessing + + Map toEmit = new TreeMap<>(); + + int indexedBlocks = 0; + for (Map.Entry block : blocks.entrySet()) { //returned in ascending order of rank (highest utility->rank 0) + toEmit.put(new VIntWritable(block.getKey()), new VIntWritable(block.getValue())); + if (++indexedBlocks == MAX_BLOCKS) { break;} //comment-out this line to skip block filtering + } + + //transform the set to an array, which will be the final output (toEmitFinal) + VIntWritable[] toEmitArray = new VIntWritable[toEmit.size()*2]; + int index = 0; + for (Map.Entry block : toEmit.entrySet()) { //returned in ascending order of rank (highest utility->rank 0) + toEmitArray[index++] = block.getKey(); + toEmitArray[index++] = 
block.getValue(); + } + VIntArrayWritable toEmitFinal = new VIntArrayWritable(toEmitArray); + + //VIntArrayWritable toEmitFinal = hadoopUtils.RelativePositionCompression.compress(toEmit); + + if (indexedBlocks > 0) { + if (_key.get() >= 0) { + reporter.incrCounter(OutputData.D1Entities, 1); + } else { + reporter.incrCounter(OutputData.D2Entities, 1); + } + output.collect(_key, toEmitFinal); + reporter.incrCounter(OutputData.BLOCK_ASSIGNMENTS, toEmit.size()); + //BC = BLOCK_ASSIGNMENTS / REDUCE_OUTPUT_RECORDS; + } //else skip this entity (it is not placed in any block) + } + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityIndexReducerNoFiltering.java b/MetaBlocking/src/main/java/preprocessing/EntityIndexReducerNoFiltering.java new file mode 100644 index 0000000..a00545d --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityIndexReducerNoFiltering.java @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.IOException; +import java.util.Iterator; +import java.util.Set; +import java.util.TreeSet; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class EntityIndexReducerNoFiltering extends MapReduceBase implements Reducer { + + + static enum OutputData {D1Entities, D2Entities, BLOCK_ASSIGNMENTS}; + + /** + * Builds the Entity Index, after performing Block Filtering + * To skip the block filtering part, just output all the blocks and not the top MAX_BLOCKS + * by commenting out the specified line + * input _key: entity id + * input values: block ids of the current entity + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + Set toEmit = new TreeSet<>(); //to sort the blocks in ascending order + while (values.hasNext()) { + Integer block = values.next().get(); //the block id + toEmit.add(new VIntWritable(block)); + } + +/* //transform the set to an array, which will be the final output (toEmitFinal) + VIntWritable[] toEmitArray = new VIntWritable[toEmit.size()]; + toEmitArray = toEmit.toArray(toEmitArray); + VIntArrayWritable toEmitFinal = new VIntArrayWritable(toEmitArray);*/ + + VIntArrayWritable toEmitFinal = hadoopUtils.RelativePositionCompression.compress(toEmit); + + if (_key.get() >= 0) { + reporter.incrCounter(OutputData.D1Entities, 1); + } else { + reporter.incrCounter(OutputData.D2Entities, 1); + } + output.collect(_key, toEmitFinal); + + reporter.incrCounter(OutputData.BLOCK_ASSIGNMENTS, toEmit.size()); + //BC = BLOCK_ASSIGNMENTS / REDUCE_OUTPUT_RECORDS; + } + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityPruningCombiner.java b/MetaBlocking/src/main/java/preprocessing/EntityPruningCombiner.java new file mode 100644 index 0000000..14d558d --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityPruningCombiner.java @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +import 
preprocessing.BasicEntityPruningMapper.InputData; + +public class EntityPruningCombiner extends MapReduceBase implements Reducer { + + //Text next = new Text(); + //VIntArrayWritable toEmit = new VIntArrayWritable(); + //VIntArrayWritable empty = new VIntArrayWritable(new VIntWritable[0]); + VIntWritable dummy = new VIntWritable(-1); + + + /** + * merges two sets of entity neighbors into one list (containing duplicates with max 2 occurences) + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + Set entities = new HashSet<>(); + // boolean first = true; + + while (values.hasNext()) { + //VIntArrayWritable nextValue = values.next(); + VIntWritable next = values.next(); +// if (first && !values.hasNext()) { //only one value => forward input to reducer +// output.collect(_key, next); +// return; +// } + //VIntWritable[] next = nextValue.get(); + reporter.progress(); + //first = false; + +// if (next.length == 0) { +// output.collect(_key, empty); +// reporter.incrCounter(InputData.NON_SINGLETON_INPUT, 1); +// return; +// } + if (next.get() == -1) { + output.collect(_key, dummy); + reporter.incrCounter(InputData.NON_SINGLETON_INPUT, 1); + return; + } + +// for (VIntWritable entity : next) { + reporter.progress(); + if (entities.add(next) == false) { //entity is nonSingular + output.collect(_key, dummy); + reporter.incrCounter(InputData.NON_SINGLETON_FOUND, 1); + return; + } +// } + } + + //at this point, all the neighbor entities are unique, so continue... +// VIntWritable[] toEmitArray = new VIntWritable[entities.size()]; +// toEmitArray = entities.toArray(toEmitArray); +// reporter.progress(); +// toEmit.set(toEmitArray); +// +// output.collect(_key, toEmit); + for (VIntWritable entity: entities) { + output.collect(_key, entity); + } + } +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityPruningDirtyFinalMapper.java b/MetaBlocking/src/main/java/preprocessing/EntityPruningDirtyFinalMapper.java new file mode 100644 index 0000000..b9f3d41 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityPruningDirtyFinalMapper.java @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.StringTokenizer; + +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class EntityPruningDirtyFinalMapper extends MapReduceBase implements Mapper { + + VIntArrayWritable toEmitFinal = new VIntArrayWritable(); + + private Path[] localFiles; + private Set nonSingulars; + public void configure(JobConf conf) { + + nonSingulars = new HashSet<>(); + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(conf); + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + String line; + while ((line = SW.readLine()) != null) { + nonSingulars.add(new VIntWritable(Integer.parseInt(line))); + } + SW.close(); + } catch (FileNotFoundException e) { + 
System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + } + + /** + * input: a blocking collection + * input key: block id + * input value: entity ids in this block, separated by "," + * output key: entity id (each of the input values) + * output value: entity ids separated by " " (neighbors of output key) + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + + reporter.setStatus("splitting the block "+key); + + List entities = new ArrayList<>(); + + + StringTokenizer tok = new StringTokenizer(value.toString(),"#"); + + //split the bilateral block in two (clean-clean ER) + for (Integer entity = Integer.parseInt(tok.nextToken()); tok.hasMoreElements(); entity=Integer.parseInt(tok.nextToken())) { + //for (String entity : entities) { + if (entity == null) { continue; } + entities.add(new VIntWritable(entity)); + reporter.progress(); + } + + + if (entities.size() < 2) { + return; + } + entities.retainAll(nonSingulars); //keep only nonSingular entities (discard singular entities) + if (entities.size() < 2) { //Discards blocks with no comparisons + return; + } + + VIntWritable[] toEmitArray = new VIntWritable[entities.size()]; + toEmitArray = entities.toArray(toEmitArray); + toEmitFinal.set(toEmitArray); + + output.collect(key, toEmitFinal); //rewrite the blocks with only nonSingulars + + } + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityPruningDriver.java b/MetaBlocking/src/main/java/preprocessing/EntityPruningDriver.java new file mode 100644 index 0000000..ce12dd5 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityPruningDriver.java @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.io.compress.BZip2Codec; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; +import org.apache.hadoop.mapred.lib.IdentityMapper; + + +public class EntityPruningDriver { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.EntityPruningDriver.class); + + conf.setJobName("Entity Pruning (1rst job)"); + + conf.setOutputKeyClass(VIntWritable.class); + //conf.setOutputValueClass(VIntWritable.class); + conf.setOutputValueClass(NullWritable.class); + //conf.setOutputValueClass(Text.class); + + conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntArrayWritable.class); + + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(TextOutputFormat.class); + + conf.setCompressMapOutput(true); +// conf.setMapOutputCompressorClass(BZip2Codec.class); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //blocking collection + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //block construction without singulars + + conf.setMapperClass(preprocessing.EntityPruningMapper.class); +// 
conf.setCombinerClass(preprocessing.EntityPruningCombiner.class); + conf.setReducerClass(preprocessing.EntityPruningReducer.class); + + + conf.set("mapred.reduce.slowstart.completed.maps", "1.0"); + conf.setInt("io.sort.mb", 700); //default 100 + conf.setFloat("io.sort.spill.percent", 0.9f); //default 0.8 + conf.setInt("io.sort.factor", 100); //default 10 + conf.setInt("mapred.task.timeout", 1800000); + conf.setNumReduceTasks(224); + //conf.setNumReduceTasks(0); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityPruningFinalDriver.java b/MetaBlocking/src/main/java/preprocessing/EntityPruningFinalDriver.java new file mode 100644 index 0000000..533ad0f --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityPruningFinalDriver.java @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.HashSet; +import java.util.Set; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.io.VLongWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.hadoop.mapred.lib.IdentityMapper; + + +public class EntityPruningFinalDriver { + + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.EntityPruningFinalDriver.class); + + conf.setJobName("Entity Pruning Final Step"); + + conf.setOutputKeyClass(VIntWritable.class); //block id + conf.setOutputValueClass(VIntArrayWritable.class); //list of VIntWritables (entity ids of this block) + +// conf.setMapOutputKeyClass(VIntWritable.class); +// conf.setMapOutputValueClass(VIntWritable.class); +// + conf.setInputFormat(SequenceFileInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + // conf.setCompressMapOutput(true); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //input blocking collection (initial input) + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //blocking collection without singulars and unary blocks + + conf.setMapperClass(preprocessing.EntityPruningFinalMapper.class); +// conf.setMapperClass(IdentityMapper.class); +// conf.setReducerClass(preprocessing.EntityPruningFinalReducer.class); + + //conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + + conf.setNumReduceTasks(0); + + Set nonSingulars = new HashSet<>(); + try{ + System.out.println("Retrieving nonSingular entities..."); + Path pt=new Path("/user/hduser/nonSingulars.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(pt))); + String line; 
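+ // The loop below streams nonSingulars.txt (which may contain duplicate ids) into an
+ // in-memory HashSet, then rewrites the de-duplicated ids to nonSingularsUnique.txt on
+ // HDFS; only that smaller file is shipped to the mappers through the DistributedCache.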
+ int lineCounter = 0; + while ((line = br.readLine()) != null) { + nonSingulars.add(line); + if ((++lineCounter % 10000) == 0) { + System.out.println(lineCounter/1000+"K lines read..."); + } + } + br.close(); + System.out.println(nonSingulars.size()+" nonSingular entities successfully retrieved."); + System.out.println("Creating nonSingulars file..."); + String uniquePath = "/user/hduser/nonSingularsUnique.txt"; + Path pt2=new Path(uniquePath); + BufferedWriter br2=new BufferedWriter(new OutputStreamWriter(fs.create(pt2,true))); + for (String nonSingular : nonSingulars) { + br2.write(nonSingular); + br2.newLine(); + } + br2.close(); + DistributedCache.addCacheFile(new URI(uniquePath), conf); + }catch (URISyntaxException e1) { + System.err.println(e1.toString()); + }catch(Exception e){ + System.err.println(e.toString()); + } + conf.set("mapred.user.jobconf.limit", "10485760"); + +// try { +// DistributedCache.addCacheFile(new URI("/user/hduser/nonSingulars.txt"), conf); // nonSingular entities (with duplicate entries) +// } catch (URISyntaxException e1) { +// System.err.println(e1.toString()); +// } + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityPruningFinalMapper.java b/MetaBlocking/src/main/java/preprocessing/EntityPruningFinalMapper.java new file mode 100644 index 0000000..406ee58 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityPruningFinalMapper.java @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.StringTokenizer; +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class EntityPruningFinalMapper extends MapReduceBase implements Mapper { + + private Path[] localFiles; + private Set nonSingulars; + public void configure(JobConf conf) { + + nonSingulars = new HashSet<>(); + BufferedReader SW; + try { + localFiles = DistributedCache.getLocalCacheFiles(conf); + SW = new BufferedReader(new FileReader(localFiles[0].toString())); + String line; + while ((line = SW.readLine()) != null) { + nonSingulars.add(new VIntWritable(Integer.parseInt(line))); + } + SW.close(); + } catch (FileNotFoundException e) { + System.err.println(e.toString()); + } catch (IOException e) { + System.err.println(e.toString()); + } + } + + /** + * input: a blocking collection + * input key: block id + * input value: entity ids in this block, separated by "," + * output key: entity id (each of the input values) + * output value: entity ids separated by " " (neighbors of output key) + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + + reporter.setStatus("splitting the block "+key); + //String []entities = value.toString().split(","); + List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + + String valueString = value.toString().replaceFirst(";", 
""); + StringTokenizer tok = new StringTokenizer(valueString,"#"); + + //split the bilateral block in two (clean-clean ER) + for (Integer entity = Integer.parseInt(tok.nextToken()); tok.hasMoreElements(); entity=Integer.parseInt(tok.nextToken())) { + //for (String entity : entities) { + if (entity == null) { continue; } + if (entity > 0) { + D1entities.add(new VIntWritable(entity)); + } else { + D2entities.add(new VIntWritable(entity)); + } + reporter.progress(); + } + + + if (D1entities.isEmpty() || D2entities.isEmpty()) { //clean-clean ER + return; + } + D1entities.retainAll(nonSingulars); + D2entities.retainAll(nonSingulars); + if (D1entities.isEmpty() || D2entities.isEmpty()) { //clean-clean ER. Discards blocks with no comparisons + return; + } + + D1entities.addAll(D2entities); + VIntWritable[] toEmitArray = new VIntWritable[D1entities.size()]; + toEmitArray = D1entities.toArray(toEmitArray); + reporter.progress(); + VIntArrayWritable toEmitFinal = new VIntArrayWritable(toEmitArray); + +// StringBuffer outputValue = new StringBuffer(); +// for (VIntWritable e1 : D1entities) { +// outputValue.append(e1+"#"); +// } +// outputValue.append(";"); +// for (VIntWritable e2 : D2entities) { +// outputValue.append(e2+"#"); +// } + //output.collect(key, new Text(outputValue.toString())); //rewrite the blocks with only nonSingulars + output.collect(key, toEmitFinal); //rewrite the blocks with only nonSingulars + + } + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityPruningFinalReducer.java b/MetaBlocking/src/main/java/preprocessing/EntityPruningFinalReducer.java new file mode 100644 index 0000000..059d960 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityPruningFinalReducer.java @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.IOException; +import java.util.Iterator; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class EntityPruningFinalReducer extends MapReduceBase implements Reducer { + + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + reporter.setStatus("reducing "+_key); + StringBuffer toEmit = new StringBuffer(); + boolean unary = true; + while (values.hasNext()) { + toEmit.append("#"+values.next().get()); + reporter.progress(); + if (values.hasNext()) { unary = false; } + } + if (!unary) { + output.collect(_key, new Text(toEmit.toString().substring(1))); //substring(1) to remove the first ',' + } + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityPruningMapper.java b/MetaBlocking/src/main/java/preprocessing/EntityPruningMapper.java new file mode 100644 index 0000000..36fcf8e --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityPruningMapper.java @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.StringTokenizer; + +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import 
org.apache.hadoop.mapred.Reporter; + +public class EntityPruningMapper extends MapReduceBase implements Mapper { + + static enum InputData {BUF_LIMIT_REACHED}; + + //VIntArrayWritable toEmitFinal = new VIntArrayWritable(); + + final int BUF_LIMIT = 10000; //write up to this number of values each time + /** + * input: a blocking collection + * input key: block id + * input value: entity ids in this block, separated by "," + * output key: entity id (each of the input values) + * output value: entity ids separated by " " (neighbors of output key) + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + + reporter.setStatus("splitting the block "+key); + //String []entities = value.toString().split(","); + List D1entities = new ArrayList<>(); + //List D2entities = new ArrayList<>(); + + String valueString = value.toString().replaceFirst(";", ""); + StringTokenizer tok = new StringTokenizer(valueString,"#"); + List buffer = new ArrayList<>(); + + //StringBuffer outputValue = new StringBuffer(); + List outputValue = new ArrayList<>(); + + int tmpCounter = 0; + //split the bilateral block in two (clean-clean ER) + for (Integer entity = Integer.parseInt(tok.nextToken()); tok.hasMoreElements(); entity=Integer.parseInt(tok.nextToken())) { + if (entity == null) { continue; } + if (entity > 0) { + D1entities.add(new VIntWritable(entity)); + } else { + //D2entities.add(new VIntWritable(entity)); + outputValue.add(new VIntWritable(entity)); + if (++tmpCounter == BUF_LIMIT) { + reporter.incrCounter(InputData.BUF_LIMIT_REACHED, 1); + + VIntWritable[] toEmitArray = new VIntWritable[outputValue.size()]; + toEmitArray = outputValue.toArray(toEmitArray); + buffer.add(new VIntArrayWritable(toEmitArray)); + + outputValue = new ArrayList<>(); + tmpCounter = 0; + + reporter.progress(); + } + } + reporter.progress(); + } + + if (outputValue.size() > 1) { + VIntWritable[] toEmitArray = new VIntWritable[outputValue.size()]; + toEmitArray = outputValue.toArray(toEmitArray); + buffer.add(new VIntArrayWritable(toEmitArray)); + } + + if (buffer.isEmpty() || D1entities.isEmpty()) { + return; + } + +// VIntWritable[] toEmitArray = new VIntWritable[outputValue.size()]; +// toEmitArray = outputValue.toArray(toEmitArray); +// reporter.progress(); +// toEmitFinal.set(toEmitArray); + + int counter = 0; + for (VIntWritable e1 : D1entities) { + reporter.setStatus(++counter+"/"+D1entities.size()); + for (VIntArrayWritable bufferedArray : buffer) { //emit all stored neighbors + output.collect(e1, bufferedArray); + } + } + + + + + + + } + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityPruningMapperNew.java b/MetaBlocking/src/main/java/preprocessing/EntityPruningMapperNew.java new file mode 100644 index 0000000..e13e2a9 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityPruningMapperNew.java @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.StringTokenizer; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +public class EntityPruningMapperNew extends MapReduceBase implements Mapper { + + //static enum InputData {BUF_LIMIT_REACHED}; + static enum InputData {NON_SINGLETON_INPUT, 
NON_SINGLETON_FOUND}; + + VIntArrayWritable toEmitFinal = new VIntArrayWritable(); + + //final int BUF_LIMIT = 10000; //write up to this number of values each time + /** + * input: a blocking collection + * input key: block id + * input value: entity ids in this block, separated by "," + * output key: entity id (each of the input values) + * output value: entity ids separated by " " (neighbors of output key) + */ + public void map(VIntWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + + reporter.setStatus("splitting the block "+key); + List D1entities = new ArrayList<>(); + List D2entities = new ArrayList<>(); + + String valueString = value.toString().replaceFirst(";", ""); + StringTokenizer tok = new StringTokenizer(valueString,"#"); + //List buffer = new ArrayList<>(); + +// int tmpCounter = 0; + //split the bilateral block in two (clean-clean ER) + for (Integer entity = Integer.parseInt(tok.nextToken()); tok.hasMoreElements(); entity=Integer.parseInt(tok.nextToken())) { + if (entity == null) { continue; } + if (entity > 0) { + D1entities.add(new VIntWritable(entity)); + } else { + D2entities.add(new VIntWritable(entity)); + //outputValue.add(new VIntWritable(entity)); +// if (++tmpCounter == BUF_LIMIT) { +// reporter.incrCounter(InputData.BUF_LIMIT_REACHED, 1); +// +// VIntWritable[] toEmitArray = new VIntWritable[outputValue.size()]; +// toEmitArray = outputValue.toArray(toEmitArray); +// buffer.add(new VIntArrayWritable(toEmitArray)); +// +// outputValue = new ArrayList<>(); +// tmpCounter = 0; +// +// reporter.progress(); +// } + } + reporter.progress(); + } + +// if (outputValue.size() > 1) { +// VIntWritable[] toEmitArray = new VIntWritable[outputValue.size()]; +// toEmitArray = outputValue.toArray(toEmitArray); +// buffer.add(new VIntArrayWritable(toEmitArray)); +// } + + if (D1entities.isEmpty() || D2entities.isEmpty()) { + return; + } + + VIntWritable[] toEmitD1Array = new VIntWritable[D1entities.size()]; + toEmitD1Array = D1entities.toArray(toEmitD1Array); + reporter.progress(); + toEmitFinal.set(toEmitD1Array); + + reporter.setStatus("finding the D2 comparisons of "+key); + for (VIntWritable e2 : D2entities) { +// for (VIntArrayWritable bufferedArray : buffer) { //emit all stored neighbors + output.collect(e2, toEmitFinal); +// } + } + + //do the same for D1 + VIntWritable[] toEmitD2Array = new VIntWritable[D2entities.size()]; + toEmitD1Array = D2entities.toArray(toEmitD2Array); + reporter.progress(); + toEmitFinal.set(toEmitD2Array); + + reporter.setStatus("finding the D1 comparisons of "+key); + for (VIntWritable e1 : D1entities) { +// for (VIntArrayWritable bufferedArray : buffer) { //emit all stored neighbors + output.collect(e1, toEmitFinal); +// } + } + + + + + + + } + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityPruningReducer.java b/MetaBlocking/src/main/java/preprocessing/EntityPruningReducer.java new file mode 100644 index 0000000..bacaa12 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityPruningReducer.java @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import 
org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class EntityPruningReducer extends MapReduceBase implements Reducer { + + + private final NullWritable NULL = NullWritable.get(); + + + private Set nonSingulars; + public void configure (JobConf job) { + nonSingulars = new HashSet<>(); + } + + + /** + * Removes singular entities from the blocks + * input _key: enitty id + * input values: entities sharing a block with the key entity + * output key: entity id of the non-singular entities + * output value: nothing + */ + /* + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + Set entities = new HashSet<>(); + Set blocks = new HashSet<>(); + + boolean singular = true; + + while (values.hasNext()) { + String[] value = values.next().toString().split(" "); + VIntWritable entity = new VIntWritable(Integer.parseInt(value[0])); + if (singular) { + if (entities.add(entity) == false) { //comparison is repeated + singular = false; + } + } + blocks.add(new VIntWritable(Integer.parseInt(value[1]))); + } + + if (!singular) { + for (VIntWritable block : blocks) { + output.collect(block, _key); + } + } +} + */ + + + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + Set entities = new HashSet<>(); + + boolean singular = true; + while (values.hasNext()) { + VIntWritable[] next = values.next().get(); + for (VIntWritable entity : next) { + if (entities.add(entity) == false) { //entity is nonSingular + singular = false; + if (nonSingulars.add(entity) == true) { //nonSingular entity added for first time (in this reducer!) + output.collect(entity, NULL); //emit the entity as nonSingular (only once) + } + } + + } + } + + if (!singular) { + if (nonSingulars.add(_key) == true) { //nonSingular entity added for first time (in this reducer!) + output.collect(_key, NullWritable.get()); + } + } + } + + + //SOLUTION 2 + //DO NOT DELETE! KEEP FOR BACKUP (WORKING!) 
SOLUTION + /*values.next(); //no need to check here; there is always at least one value + if (values.hasNext()) { //comparison appears at least twice + String[] comparison = _key.toString().split(" "); + String e1 = comparison[0]; + String e2 = comparison[1]; + output.collect(new Text(e1), NullWritable.get()); + output.collect(new Text(e2), NullWritable.get()); + } + + + }*/ + +} diff --git a/MetaBlocking/src/main/java/preprocessing/EntityPruningReducerNew.java b/MetaBlocking/src/main/java/preprocessing/EntityPruningReducerNew.java new file mode 100644 index 0000000..512f451 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/EntityPruningReducerNew.java @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + + +import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + +public class EntityPruningReducerNew extends MapReduceBase implements Reducer { + + + private final NullWritable NULL = NullWritable.get(); + + + + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + Set entities = new HashSet<>(); + boolean first = true; + + while (values.hasNext()) { + if (first && !values.hasNext()) { //only one value => no repeated comparisons => singular + return; + } + first = false; + VIntWritable[] next = values.next().get(); + if (next.length == 0) { + output.collect(_key, NULL); //emit the entity as nonSingular (only once) + return; + } + for (VIntWritable entity : next) { + if (entities.add(entity) == false) { //entity is nonSingular + output.collect(_key, NULL); //emit the entity as nonSingular (only once) + return; + } + + } + } + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/ExtendedInputDriver.java b/MetaBlocking/src/main/java/preprocessing/ExtendedInputDriver.java new file mode 100644 index 0000000..9b428b8 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/ExtendedInputDriver.java @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; + +import preprocessing.ExtendedInputReducer.OutputData; + + + +public class ExtendedInputDriver extends Configured { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.ExtendedInputDriver.class); + + conf.setJobName("Extended Input"); + + 
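+ // This job joins the entity index back onto the blocks: ExtendedInputMapper emits, for
+ // every block id in an entity's block list Bi, a record (block id, [entity id, Bi]),
+ // and ExtendedInputReducer concatenates those records per block id, producing the
+ // "extended input" (the blocking collection annotated with the entity index).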
conf.setMapOutputKeyClass(VIntWritable.class); + conf.setMapOutputValueClass(VIntArrayWritable.class); + + conf.setOutputKeyClass(VIntWritable.class); //block id + conf.setOutputValueClass(Text.class); //list of entities in this block, along with their other blocks + + conf.setInputFormat(SequenceFileInputFormat.class); + //conf.setOutputFormat(TextOutputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); //Entity Index + FileOutputFormat.setOutputPath(conf, new Path(args[1])); //extended input file (blocking collection with entity index) + + conf.setMapperClass(preprocessing.ExtendedInputMapper.class); +// conf.setMapperClass(preprocessing.ExtendedInputMapperARCS.class); + conf.setReducerClass(preprocessing.ExtendedInputReducer.class); + + conf.setInt("mapred.task.timeout", 10000000); + conf.set("mapred.reduce.slowstart.completed.maps", "1.00"); + conf.setMaxReduceTaskFailuresPercent(10); //acceptable failures before the whole job fails + conf.set("mapred.reduce.max.attempts", "10"); //before it is not started again + conf.set("mapred.max.tracker.failures", "100"); //before it gets black-listed + conf.set("mapred.job.tracker.handler.count", "40"); + + conf.setNumReduceTasks(560); + + conf.setCompressMapOutput(true); + + client.setConf(conf); + RunningJob job = null; + try { + job = JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + + //the following is used only for CNP,CEPTotalOrder but does not create any overhead (keep it always) + if (job == null) { + System.err.println("No job found"); + return; + } + + BufferedWriter bwClean = null; + BufferedWriter bwDirty = null; + try { + Counters counters = job.getCounters(); + Long dirtyBlocks = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", + "REDUCE_OUTPUT_RECORDS").getCounter(); + Long cleanBlocks = counters.findCounter(OutputData.CLEAN_BLOCKS).getCounter(); + Path cleanPath=new Path("/user/hduser/numBlocksClean.txt"); + Path dirtyPath=new Path("/user/hduser/numBlocksDirty.txt"); + FileSystem fs = FileSystem.get(new Configuration()); + bwClean = new BufferedWriter(new OutputStreamWriter(fs.create(cleanPath,true))); + bwDirty = new BufferedWriter(new OutputStreamWriter(fs.create(dirtyPath,true))); + bwClean.write(cleanBlocks.toString()); + bwDirty.write(dirtyBlocks.toString()); + } catch (IllegalArgumentException | IOException e) { + System.err.println(e.toString()); + } finally { + try { bwClean.close(); bwDirty.close(); } + catch (IOException e) { System.err.println(e.toString());} + } + } + + + +} diff --git a/MetaBlocking/src/main/java/preprocessing/ExtendedInputMapper.java b/MetaBlocking/src/main/java/preprocessing/ExtendedInputMapper.java new file mode 100644 index 0000000..6d2cabe --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/ExtendedInputMapper.java @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.IOException; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + +import preprocessing.VIntArrayWritable; + + + +public class ExtendedInputMapper extends MapReduceBase implements Mapper { + + VIntArrayWritable toEmit = new VIntArrayWritable(); + + /** + * maps an input entity index into (key, 
value) pair(s) + * the value is the entity id (input key) along with the ids of blocks that contain it + * the key each time is a block id (each element of the input value array) + * @param key an entity id + * @param value an array of block ids that this entity belongs to + * @param output key: a block id (each element of the input value array) - value: the entity id (input key), the ids of blocks containing this entity (Bi) + */ + public void map(VIntWritable key, VIntArrayWritable value, + OutputCollector output, Reporter reporter) throws IOException { + + VIntWritable [] Bi = value.get(); + VIntWritable[] iWithBi = new VIntWritable[Bi.length+1]; + + iWithBi[0] = key; //the first element is the entity i + System.arraycopy(Bi, 0, iWithBi, 1, Bi.length);//the remaining elements are the blocks of i (Bi) + + toEmit.set(iWithBi); + + //VIntWritable[] uncompressed = hadoopUtils.RelativePositionCompression.uncompress(value).get(); + for (VIntWritable bi : Bi) { + output.collect(bi, toEmit); + } + + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/ExtendedInputMapperARCS.java b/MetaBlocking/src/main/java/preprocessing/ExtendedInputMapperARCS.java new file mode 100644 index 0000000..cf82952 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/ExtendedInputMapperARCS.java @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.IOException; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + + + +public class ExtendedInputMapperARCS extends MapReduceBase implements Mapper { + + VIntArrayWritable toEmit = new VIntArrayWritable(); + + /** + * maps an input entity index into (key, value) pair(s) + * the value is the entity id (input key) along with the ids of blocks that contain it and the size of each block + * the key each time is a block id (each element of the input value array) + * @param key an entity id + * @param value an array of [blockId,blockSize] of the blocks that this entity belongs to + * @param output key: a block id (each element of the input value array) - value: the entity id (input key), the ids of blocks containing this entity (Bi), each along with its size |b| + */ + public void map(VIntWritable key, VIntArrayWritable value, + OutputCollector output, Reporter reporter) throws IOException { + + VIntWritable [] Bi = value.get(); + VIntWritable[] iWithBi = new VIntWritable[Bi.length+1]; + iWithBi[0] = key; //the first element is the entity i + reporter.progress(); + System.arraycopy(Bi, 0, iWithBi, 1, Bi.length); //the remaining elements are the blocks of i (Bi) + reporter.progress(); + toEmit.set(iWithBi); + + //VIntWritable[] uncompressed = hadoopUtils.RelativePositionCompression.uncompress(value).get(); + for (int i = 0; i < Bi.length; i+=2) { //i+=2 to skip blockSizes + VIntWritable bi = Bi[i]; + output.collect(bi, toEmit); + } + + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/ExtendedInputReducer.java b/MetaBlocking/src/main/java/preprocessing/ExtendedInputReducer.java new file mode 100644 index 0000000..2b2e977 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/ExtendedInputReducer.java @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Iterator; + + +import org.apache.hadoop.io.Text; +import 
org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + + +public class ExtendedInputReducer extends MapReduceBase implements Reducer { + + Text toEmit = new Text(); + + static enum OutputData {CLEAN_BLOCKS}; + + /** + * @param _key a block id (each element of the input value array) - + * @param values a list of (entity id i, the ids and sizes of blocks containing this entity (Bi,|Bi|)) + * @param output key: block id (same as input key). value: a textual concatenation of input int arrays + * + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + StringBuilder toEmitBuffer = new StringBuilder(); + boolean atLeastTwoEntities = false; + boolean containsNegative = false; + boolean containsPositive = false; + while (values.hasNext()) { + if (toEmitBuffer.length() > 1) { + atLeastTwoEntities = true; + } + VIntWritable[] entityWithBlocks = values.next().get(); + toEmitBuffer.append(Arrays.toString(entityWithBlocks)); + if (!containsNegative && entityWithBlocks[0].get() < 0) { + containsNegative = true; + } + if (!containsPositive && entityWithBlocks[0].get() >= 0) { + containsPositive = true; + } + } + if (atLeastTwoEntities) { + toEmit.set(toEmitBuffer.toString()); + output.collect(_key, toEmit); + if (containsNegative && containsPositive) { //a valid block for clean-clean ER + reporter.incrCounter(OutputData.CLEAN_BLOCKS, 1); + } //DIRTY_BLOCKS = REDUCE_OUTPUT_RECORDS + } //PURGED_BLOCKS = REDUCE_INPUT_GROUPS - REDUCE_OUTPUT_RECORDS + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/ExtendedInputReducerNew.java b/MetaBlocking/src/main/java/preprocessing/ExtendedInputReducerNew.java new file mode 100644 index 0000000..156fa58 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/ExtendedInputReducerNew.java @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; + + +public class ExtendedInputReducerNew extends MapReduceBase implements Reducer { + + private final VIntWritable DELIM = new VIntWritable(Integer.MIN_VALUE); + + static enum OutputData {CLEAN_BLOCKS}; + + /** + * @param _key a block id (each element of the input value array) - + * @param values a list of (entity id i, the ids and sizes of blocks containing this entity (Bi,|Bi|)) + * @param output key: block id (same as input key). 
value: a concatenation of input int arrays + * + */ + public void reduce(VIntWritable _key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { + + List inputList = new ArrayList<>(); + boolean atLeastTwoEntities = false; + boolean containsNegative = false; + boolean containsPositive = false; + while (values.hasNext()) { + if (inputList.size() > 1) { + atLeastTwoEntities = true; + } + VIntWritable[] entityWithBlocks = values.next().get(); + inputList.addAll(Arrays.asList(entityWithBlocks)); + inputList.add(DELIM); + // toEmitBuffer.append(Arrays.toString(entityWithBlocks)); + if (!containsNegative && entityWithBlocks[0].get() < 0) { + containsNegative = true; + } + if (!containsPositive && entityWithBlocks[0].get() >= 0) { + containsPositive = true; + } + } + if (atLeastTwoEntities) { + inputList.remove(inputList.size()-1); //remove the last element (i.e., the last DELIM) + VIntWritable[] tmpArray = new VIntWritable[inputList.size()]; + + output.collect(_key, new VIntArrayWritable(inputList.toArray(tmpArray))); + if (containsNegative && containsPositive) { //a valid block for clean-clean ER + reporter.incrCounter(OutputData.CLEAN_BLOCKS, 1); + } //DIRTY_BLOCKS = REDUCE_OUTPUT_RECORDS + } //PURGED_BLOCKS = REDUCE_INPUT_GROUPS - REDUCE_OUTPUT_RECORDS + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/TextToSequence.java b/MetaBlocking/src/main/java/preprocessing/TextToSequence.java new file mode 100644 index 0000000..429cb72 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/TextToSequence.java @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.hadoop.mapred.lib.IdentityReducer; + +public class TextToSequence { + + public static void main(String[] args) { + JobClient client = new JobClient(); + JobConf conf = new JobConf(preprocessing.TextToSequence.class); + + conf.setJobName("Text to SequnceFileFormat"); + + conf.setOutputKeyClass(VIntWritable.class); + conf.setOutputValueClass(VIntArrayWritable.class); + + + conf.setInputFormat(TextInputFormat.class); + conf.setOutputFormat(SequenceFileOutputFormat.class); + SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); + + FileInputFormat.setInputPaths(conf, new Path(args[0])); + FileOutputFormat.setOutputPath(conf, new Path(args[1])); + + conf.setMapperClass(preprocessing.TextToSequenceMapperArrayWritable.class); + conf.setReducerClass(IdentityReducer.class); //used for load balancing + + conf.setInt("mapred.task.timeout", 800000); + + //conf.setNumReduceTasks(0); //no reducer + conf.setNumReduceTasks(224); +// conf.setNumReduceTasks(112); + + client.setConf(conf); + try { + JobClient.runJob(conf); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/TextToSequenceMapper.java b/MetaBlocking/src/main/java/preprocessing/TextToSequenceMapper.java new file mode 100644 index 0000000..b88e4bc --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/TextToSequenceMapper.java @@ -0,0 +1,44 
@@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.IOException; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + + + +public class TextToSequenceMapper extends MapReduceBase implements Mapper { + + + //static enum InputData {NULL_ENTITY}; + + VIntWritable blockId = new VIntWritable(); + Text entities = new Text(); + /** + * input blocks: blockid (int) \t entity ids (ints, string separated) + * output the same with blockid as VIntWritable and block contents as Text + * + */ + public void map(LongWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + + + String[] block = value.toString().split("\t"); + blockId.set(Integer.parseInt(block[0])); + entities.set(block[1]); + //if (blockId.get() == 950828) { //FIXME: delete (just used for debugging) +// reporter.setStatus(entities.toString()); +// blockId.set(1); //just for debugging +// entities.set("1"); //just for debugging + //} + output.collect(blockId, entities); + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/TextToSequenceMapperArrayWritable.java b/MetaBlocking/src/main/java/preprocessing/TextToSequenceMapperArrayWritable.java new file mode 100644 index 0000000..7e77895 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/TextToSequenceMapperArrayWritable.java @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; + + + +public class TextToSequenceMapperArrayWritable extends MapReduceBase implements Mapper { + + + //static enum InputData {NULL_ENTITY}; + + VIntWritable blockId = new VIntWritable(); + + /** + * input blocks: blockid (int) \t entity ids (ints, string separated) + * output the same with blockid as VIntWritable and block contents as Text + * + */ + public void map(LongWritable key, Text value, + OutputCollector output, Reporter reporter) throws IOException { + + + String[] block = value.toString().split("\t"); + blockId.set(Integer.parseInt(block[0])); + String[] entities = (block[1].split("#")); + List toEmitList = new ArrayList<>(); + for (String entity : entities) { + toEmitList.add(new VIntWritable(Integer.parseInt(entity))); + } + + VIntWritable[] toEmitArray = new VIntWritable[toEmitList.size()]; + VIntArrayWritable toEmit = new VIntArrayWritable(toEmitList.toArray(toEmitArray)); + + output.collect(blockId, toEmit); + } + +} diff --git a/MetaBlocking/src/main/java/preprocessing/VIntArrayWritable.java b/MetaBlocking/src/main/java/preprocessing/VIntArrayWritable.java new file mode 100644 index 0000000..5b92345 --- /dev/null +++ b/MetaBlocking/src/main/java/preprocessing/VIntArrayWritable.java @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2015 Vasilis Efthymiou + */ +package preprocessing; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.io.ArrayWritable; +import 
org.apache.hadoop.io.VIntWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableUtils; + +public class VIntArrayWritable extends ArrayWritable { + + public VIntArrayWritable() { + super(VIntWritable.class); + } + + public VIntArrayWritable(VIntWritable[] values) { + super(VIntWritable.class, values); + } + + @Override + public VIntWritable[] get() { + return (VIntWritable[]) super.get(); + } + + @Override + public void set(Writable[] values) { + super.set((VIntWritable[]) values); + } + + @Override + public String toString() { + return Arrays.toString(get()); + } + + public void write(DataOutput out) { + try { + //first store a counter for the size of the data written + out.writeInt(get().length); + //out.writeShort(get().length); //2 bytes range = [-32768, 32767] + //out.writeByte(get().length); //use this instead, if maximum size < 256 = 2^8 + //writeByte just reduces disk usage by |entities| bytes (few MBs) + for (VIntWritable i : get()) { + //out.writeInt(i.get()); + //i.write(out); + WritableUtils.writeVInt(out, i.get()); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + + public void readFields(DataInput in) { + try { + int counter = in.readInt(); + //short counter = in.readShort(); + //byte counter = in.readByte(); //use this instead, if writeByte is used in write() + + VIntWritable[] values = new VIntWritable[counter]; + for (int i = 0; i < counter; ++i){ + values[i] = new VIntWritable(WritableUtils.readVInt(in)); + } + + set(values); + } catch (IOException e) { + e.printStackTrace(); + } + } + +} diff --git a/MetaBlocking/target/MetaBlocking-1.0.jar b/MetaBlocking/target/MetaBlocking-1.0.jar new file mode 100644 index 0000000000000000000000000000000000000000..c38fb73fcc3042910e8e07cf89886b2a03cb956c GIT binary patch literal 472091 zcma%ib986Twr%u_ZKq?~wrzH7+eyc^ZQHh;4m-AOck=qX=iYO_bKkpXy!}`0vDd0y zqiU{IbI)2^K^hbc8VC{+5(us2U82Hhi3bw|2&e!I2nZDj2uM~`MUYlfPK;hwP)<@z zR9S^iR?J1EKW2>qv77i7;T2e1az$YXPOYLzHJ+-v>DN*m?9R?gxeelnkJ&{XlLa;1 z%k)%c`Z30}z$7RaJ>O3*iw7?1dqci;-J&}~FFiAY%;(lzyX*VaN|56r`kPW0*MQ|T zL6(sMihC;hJ9z8mv>x`gaItYeCf9ScGci;TyxDurWkvEX{s?to{Q4QJZ2hfC$RuEU zd|ME3vQnf*q|2Qaj0S?Fva(|#5{hFcjo+$tL||~Y?j8X8L_Rzgvnw1juu_T}8)PES zbEPczk)PF)w{5AyUZU^KSZ%sABT=E|^FL)wX5L6n#^3!9L@vwo+~ahc4(=_5@~3Vo z9^;>la)$4V)6hK{QpuZ2&tZLXl^+H1=~Gk*@zG5=B^97?mZ#=07aRdB!mJh|Jk>l2 zbI4fu^}GQ+?&Sq}!#@n?=)JEiC1b=0(5ZW^oiL=iR2?kQ^%SL?g=s4aG#KSoO8^@r z)n|QhES25n^j@egy2mETVrB2&&~ATzv7Cj}D`TBZr2c@94{9rb_LWoo$#ELA{Vg zFhZ2H1roVa4~H;h>UR{h3SU#k%w~a?8>{wVKKqgNlO|XCP2cTXGi2=;7Y+^lGRq0> z?NKJ^>K=DDhxut+x0g3-5w}w5>ji>>Gz8?|@uTn`$O3`{`VVCN0nEo-+0she|XFO=}PE7 zXZ=6B`YVb2wT97R8<&WnKtQ#BmY(EKSN~i?L01z;12YqK6ALqQXJI=VLkn9IM>->G z11G0pRc(0`LDbI@40KOmumdupFd=O^m@ULe3Iz!XDiZx~Z)}@#Yr*rI)^0d2fKTur z{xY04+->$De1KApJ1~H5bV2%di zssIuR!5BzGwGEWo3U_QF2{)7(s&{$e-0)V4@U8YEz_j@6V83jSP)tlk3Oj?g=vE4C1f?O+lsrO`a%y z_}CL5FMH&Eqbccq03Z^nsBVbCo$d>g%RW8->w-JQU`5;Jq=T=P1NPFE9VtG!c@3$E zeaASKk-|e)3(c$LJ%Fc_J^zG`%kVZ)R*H*LajvM78c}+@B*NoT>8@DSI3!*a6ucgm$Ucd+c`H#o^yEoyj{cyf_cS0mqp3GJ~WUE zVsUjwKdZDNMXnrTcaU1H6eaQw9VX=yF$BeArCKZ6V!j51I39mrAt>aCp=Z@c2&=bW zl4fDXfS%f$_jmij;z4_qL0fP!tz*gwOi88*i+}`@!tgm_mYC>g1sz$lVk#|@513_j zKn(HjOd&teU&-|EarBS$_g@j#X^%tx*ETKiXPc(^Un5LZN?F#x-rmHK}w6RI$uGk2p9K+YlT7^rwN8ktjp0^3Q~@ zrV!Y{0m1T*Gnd~+^&P$H^xZ~~1dydZD`pH6IpiM&lU}@YMhw(ltOq%?TC1D+Hy+wi ze#Pv~tsgHMFXi#MMn9FJHz$0{NA!Ktc&T{~t_0L3(!LT)&^u}S>ogpA>e)|Ng7uDN zQ{|fcYTv>M?bKpk;>^BZH8(c*T)SUpv3do&7h5^C_WOt<$F8czx&=jjHz<+2i#dL9 z!%QhfrPGF+xFXe{-bi3SIWSZ05=MT|a@$$oD<*Cx 