From 0d08cf8f704bc7d81faa67ce81edd324f44f73e0 Mon Sep 17 00:00:00 2001 From: Marek Kasik Date: Mon, 14 Apr 2014 17:56:01 +0200 Subject: [PATCH 2/2] Improve efficiency of searching for tables Search for left/right/up/down neighbours of each block. Use these blocks to determine whether the actual block belong to a table or not (#3188). --- poppler/TextOutputDev.cc | 242 ++++++++++++++++++++++++++++++++++++----------- poppler/TextOutputDev.h | 4 + 2 files changed, 189 insertions(+), 57 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 29e9f56..467df49 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -1224,6 +1224,8 @@ TextBlock::TextBlock(TextPage *pageA, int rotA) { rot = rotA; xMin = yMin = 0; xMax = yMax = -1; + ExMin = EyMin = 0; + ExMax = EyMax = -1; priMin = 0; priMax = page->pageWidth; pool = new TextPool(); @@ -1231,6 +1233,10 @@ TextBlock::TextBlock(TextPage *pageA, int rotA) { curLine = NULL; next = NULL; stackNext = NULL; + left = NULL; + right = NULL; + up = NULL; + down = NULL; tableId = -1; tableEnd = gFalse; } @@ -2427,6 +2433,22 @@ void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link) links->append(new TextLink(xMin, yMin, xMax, yMax, link)); } +typedef struct { + TextBlock *blk; + TextBlock *blk_end; + double value; +} Range; + +int cmpRange(const void *p1, const void *p2) { + Range *r1 = *(Range **)p1; + Range *r2 = *(Range **)p2; + double cmp = 0.0; + + cmp = r1->value - r2->value; + + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { UnicodeMap *uMap; TextPool *pool; @@ -3143,7 +3165,126 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { visited[i] = gFalse; } - double bxMin0, byMin0, bxMin1, byMin1; + /* + * Find adjacent blocks + */ + Range **ranges = new Range* [2 * nBlocks]; + Range *range1 = NULL, *range2 = NULL; + int k = 0, l = 0; + + for (blk1 = blkList; blk1; blk1 = blk1->next) { + range1 = new Range[1]; + range2 = new Range[1]; + + range1->blk = blk1; + range2->blk = NULL; + + range1->blk_end = NULL; + range2->blk_end = blk1; + + range1->value = blk1->yMin; + range2->value = blk1->yMax; + + ranges[k] = range1; + ranges[k + 1] = range2; + + k += 2; + } + + /* + * Sort all y coordinates + */ + qsort (ranges, 2 * nBlocks, sizeof (Range *), cmpRange); + + /* + * Find the closest neighbour (left/right) of each block + */ + for (k = 0; k < (2 * nBlocks); k++) { + range1 = ranges[k]; + + if (range1->blk) { + blk1 = range1->blk; + l = k + 1; + + while (ranges[l]->value < range1->blk->yMax) { + range2 = ranges[l]; + + if (range2->blk) { + blk2 = range2->blk; + + if (blk2->xMin > blk1->xMax) { + if (blk1->right == NULL || blk2->xMin < blk1->right->xMin) + blk1->right = blk2; + + if (blk2->left == NULL || blk1->xMax > blk2->left->xMax) + blk2->left = blk1; + } + else if (blk2->xMax < blk1->xMin) { + if (blk1->left == NULL || blk2->xMax > blk1->left->xMax) + blk1->left = blk2; + + if (blk2->right == NULL || blk1->xMin < blk2->right->xMin) + blk2->right = blk1; + } + } + + l++; + } + } + } + + for (k = 0; k < (2 * nBlocks); k++) { + range1 = ranges[k]; + + if (range1->blk) + range1->value = range1->blk->xMin; + + if (range1->blk_end) + range1->value = range1->blk_end->xMax; + } + + /* + * Sort all x coordinates + */ + qsort (ranges, 2 * nBlocks, sizeof (Range *), cmpRange); + + /* + * Find the closest neighbour (up/down) of each block + */ + for (k = 0; k < (2 * nBlocks); k++) { + range1 = ranges[k]; + + if (range1->blk) { + blk1 = range1->blk; + l = k + 1; + + while (ranges[l]->value < range1->blk->xMax) { + range2 = ranges[l]; + + if (range2->blk) { + blk2 = range2->blk; + + if (blk2->yMin > blk1->yMax) { + if (blk1->down == NULL || blk2->yMin < blk1->down->yMin) + blk1->down = blk2; + + if (blk2->up == NULL || blk1->yMax > blk2->up->yMax) + blk2->up = blk1; + } + else if (blk2->yMax < blk1->yMin) { + if (blk1->up == NULL || blk2->yMax > blk1->up->yMax) + blk1->up = blk2; + + if (blk2->down == NULL || blk1->yMin < blk2->down->yMin) + blk2->down = blk1; + } + } + + l++; + } + } + } + int numTables = 0; int tableId = -1; int correspondenceX, correspondenceY; @@ -3158,11 +3299,6 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { blk1->EyMin = blk1->yMin; blk1->EyMax = blk1->yMax; - bxMin0 = DBL_MAX; - byMin0 = DBL_MAX; - bxMin1 = DBL_MAX; - byMin1 = DBL_MAX; - fblk2 = NULL; fblk3 = NULL; fblk4 = NULL; @@ -3170,33 +3306,11 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { /* find fblk2, fblk3 and fblk4 so that * fblk2 is on the right of blk1 and overlap with blk1 in y axis * fblk3 is under blk1 and overlap with blk1 in x axis - * fblk4 is under blk1 and on the right of blk1 - * and they are closest to blk1 + * fblk4 is under fblk2 and on the right of fblk3 */ - for (blk2 = blkList; blk2; blk2 = blk2->next) { - if (blk2 != blk1) { - if (blk2->yMin <= blk1->yMax && - blk2->yMax >= blk1->yMin && - blk2->xMin > blk1->xMax && - blk2->xMin < bxMin0) { - bxMin0 = blk2->xMin; - fblk2 = blk2; - } else if (blk2->xMin <= blk1->xMax && - blk2->xMax >= blk1->xMin && - blk2->yMin > blk1->yMax && - blk2->yMin < byMin0) { - byMin0 = blk2->yMin; - fblk3 = blk2; - } else if (blk2->xMin > blk1->xMax && - blk2->xMin < bxMin1 && - blk2->yMin > blk1->yMax && - blk2->yMin < byMin1) { - bxMin1 = blk2->xMin; - byMin1 = blk2->yMin; - fblk4 = blk2; - } - } - } + fblk2 = blk1->right; + fblk3 = blk1->down; + fblk4 = (fblk2 && fblk3 && fblk2->down == fblk3->right) ? fblk2->down : NULL; /* fblk4 can not overlap with fblk3 in x and with fblk2 in y * fblk2 can not overlap with fblk3 in x and y @@ -3366,42 +3480,56 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { /* set extended bounding boxes of all other blocks * so that they extend in x without hitting neighbours */ - for (blk1 = blkList; blk1; blk1 = blk1->next) { - if (!(blk1->tableId >= 0)) { - double xMax = DBL_MAX; - double xMin = DBL_MIN; + for (k = 0; k < (2 * nBlocks); k++) { + range1 = ranges[k]; + double xMaxValue, xMinValue; - for (blk2 = blkList; blk2; blk2 = blk2->next) { - if (blk2 == blk1) - continue; + if (range1->blk) { + blk1 = range1->blk; + l = k + 1; - if (blk1->yMin <= blk2->yMax && blk1->yMax >= blk2->yMin) { - if (blk2->xMin < xMax && blk2->xMin > blk1->xMax) - xMax = blk2->xMin; + if (blk1->right) + xMaxValue = blk1->right->xMin; + else + xMaxValue = DBL_MAX; - if (blk2->xMax > xMin && blk2->xMax < blk1->xMin) - xMin = blk2->xMax; - } + if (blk1->left) + xMinValue = blk1->left->xMax; + else + xMinValue = DBL_MIN; + + while (l < (2 * nBlocks) && ranges[l]->value <= xMaxValue) { + range2 = ranges[l]; + + if (range2->blk_end && + range2->value > blk1->ExMax && + range2->blk_end->yMin >= blk1->yMax) + blk1->ExMax = range2->value; + + l++; } - for (blk2 = blkList; blk2; blk2 = blk2->next) { - if (blk2 == blk1) - continue; + l = k - 1; - if (blk2->xMax > blk1->ExMax && - blk2->xMax <= xMax && - blk2->yMin >= blk1->yMax) { - blk1->ExMax = blk2->xMax; - } + while (l >= 0 && ranges[l]->value >= xMinValue) { + range2 = ranges[l]; - if (blk2->xMin < blk1->ExMin && - blk2->xMin >= xMin && - blk2->yMin >= blk1->yMax) - blk1->ExMin = blk2->xMin; + if (range2->blk && + range2->value < blk1->ExMin && + range2->blk->yMin >= blk1->yMax) + blk1->ExMin = range2->value; + + l--; } } } + for (k = 0; k < (2 * nBlocks); k++) { + delete[] ranges[k]; + } + delete[] ranges; + + // sort blocks into yx order (in preparation for reading order sort) TextBlock **yxordered; yxordered = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *)); diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index 61ffad9..4546058 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -398,6 +398,10 @@ private: TextBlock *next; TextBlock *stackNext; + TextBlock *left; + TextBlock *right; + TextBlock *up; + TextBlock *down; friend class TextLine; friend class TextLineFrag; -- 1.9.0