main 665562f57c77 cached
13 files
12.7 MB
25.0k tokens
1 symbols
1 requests
Download .txt
Repository: wjbmattingly/ocr_python_textbook
Branch: main
Commit: 665562f57c77
Files: 13
Total size: 12.7 MB

Directory structure:
gitextract_l23f_itf/

├── 01_02_libraries.py
├── 02_01_working_with_images.py
├── 02_02_working with opencv.ipynb
├── 02_02_working_with_opencv.py
├── 02_03_intro_to_pytes.ipynb
├── 03_bounding_boxes_index-Copy1.ipynb
├── 03_bounding_boxes_index_ocr.ipynb
├── 03_ocr_index-Copy1.ipynb
├── 03_ocr_index.ipynb
├── 04_01_get_whole_text.ipynb
├── 04_03_capture_side_notes.ipynb
├── 04_04_separate_footnotes.ipynb
└── README.md

================================================
FILE CONTENTS
================================================

================================================
FILE: 01_02_libraries.py
================================================
import cv2
from PIL import Image
import pytesseract


================================================
FILE: 02_01_working_with_images.py
================================================
from PIL import Image

im_file = "data/page_01.jpg"

im = Image.open(im_file)
im.save("temp/page_01.jpg")


================================================
FILE: 02_02_working with opencv.ipynb
================================================
[File too large to display: 12.6 MB]

================================================
FILE: 02_02_working_with_opencv.py
================================================
#https://nanonets.com/blog/ocr-with-tesseract/
#COVERED IN THIS VIDEO:
#


def remove_borders(img):
    contours,hierarchy = cv2.findContours(img,cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_SIMPLE)
    cntsSorted = sorted(contours, key=lambda x: cv2.contourArea(x))
    cnt = cntsSorted[-1]
    print (len(contours))
    x,y,w,h = cv2.boundingRect(cnt)
    crop = img[y:y+h,x:x+w]
    return (crop)





no_borders = remove_borders(no_noise)
cv2.imwrite("temp/no_borders.jpg", no_borders)
display("temp/no_borders.jpg")

color = [255, 255, 255] # 'cause purple!

# border widths; I set them all to 150
top, bottom, left, right = [150]*4

img_with_border = cv2.copyMakeBorder(no_borders, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)

cv2.imwrite("temp/image_border.jpg", img_with_border)
display("temp/image_border.jpg")




















# image = cv2.imread('data/page_01.jpg', 0)
# clahe = cv2.createCLAHE().apply(image)
#
# sharpen_kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
# sharpen = cv2.filter2D(clahe, -1, sharpen_kernel)
#
# thresh = cv2.threshold(sharpen, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
# cv2.imshow('clahe', clahe)
# cv2.imshow('sharpen', sharpen)
# cv2.imshow('thresh', thresh)
#
# cv2.waitKey()

# # get grayscale image
# def get_grayscale(image):
#     return cv.cvtColor(image, cv.COLOR_BGR2GRAY)
# # noise removal
# def remove_noise(image):
#     return cv.medianBlur(image,5)
# img = cv.imread('data/page_01.jpg')
# img = get_grayscale(img)
# # img = remove_noise(img)
# ret,thresh1 = cv.threshold(img,127,255,cv.THRESH_BINARY)
# ret,thresh2 = cv.threshold(img,127,255,cv.THRESH_BINARY_INV)
# ret,thresh3 = cv.threshold(img,127,255,cv.THRESH_TRUNC)
# ret,thresh4 = cv.threshold(img,127,255,cv.THRESH_TOZERO)
# ret,thresh5 = cv.threshold(img,127,255,cv.THRESH_TOZERO_INV)
#
# titles = ['Original Image','BINARY','BINARY_INV','TRUNC','TOZERO','TOZERO_INV']
# images = [img, thresh1, thresh2, thresh3, thresh4, thresh5]
# for i in range(6):
#     plt.subplot(2,3,i+1),plt.imshow(images[i],'gray',vmin=0,vmax=255)
#     plt.title(titles[i])
#     plt.xticks([]),plt.yticks([])
# plt.show()


================================================
FILE: 02_03_intro_to_pytes.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pytesseract\n",
    "from PIL import Image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "img_file = \"data/page_01.jpg\"\n",
    "no_noise = \"temp/no_noise.jpg\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'Image' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-3-1c7e5ef55aa8>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mimg\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mImage\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mno_noise\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      2\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mimg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNameError\u001b[0m: name 'Image' is not defined"
     ]
    }
   ],
   "source": [
    "img = Image.open(no_noise)\n",
    "display(img)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "ocr_result = pytesseract.image_to_string(img)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "“GABRIEL Meamall\n",
      "\n",
      "On Easter movning in the year 1944, I took my six-year-old\n",
      "son by the hand and began walking fron my home town toward the\n",
      "valleys and forests of the Carpathizn mountains. For nearly\n",
      "eight months we lived in barns, attics and makeshift eabins. With\n",
      "the gene nous help of an unusually courageous man, we managed to\n",
      "survive Europe's greatest fit of madness. Those who walked in\n",
      "the opposite direction on that Easter day were lese fortunate.\n",
      "They were taken in trainloads to places whose once obscure names\n",
      "are now, and forever will be, synonymous with terror, evil and\n",
      "death. What follows is our story of survival told to the best\n",
      "of my ability, in plain, simple language.\n",
      "\n",
      "In March of 1944 the SS troops took over the internal affairs\n",
      "of Hungary and proceeded to organize the deportation of the dows.\n",
      "To the Nazie thie was a routine assignment; within hours all local\n",
      "officials were informed of operational plans. The high command\n",
      "issued a directive designed to placate Jewish fears and induce\n",
      "cooperation. It was announced that the Jews would be shipped to\n",
      "Poland as an emergency labor force and that they were only being\n",
      "drafted for temporary work. There were many who believed this\n",
      "version. Others, less credulous, resigred themselves and hoped for\n",
      "the vest. Still others began to make plans for escape. By Aprii\n",
      "13 the Hungarian Jews were being rounded up from all over the .\n",
      "country in what was once a huge brick factory. The rest is well\n",
      "known. :\n",
      "\n",
      "J was working in Ungvar and usually came home on weekends.\n",
      "\n",
      "At that time it was no longer possible for a Jew to travel freely.\n",
      "\n",
      " \n",
      "\f",
      "\n"
     ]
    }
   ],
   "source": [
    "print (ocr_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: 03_bounding_boxes_index-Copy1.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pytesseract\n",
    "import cv2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "image = cv2.imread(\"data/index_02.JPG\")\n",
    "base_image = image.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv2.imwrite(\"temp/index_gray.png\", gray)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "blur = cv2.GaussianBlur(gray, (7,7), 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv2.imwrite(\"temp/index_blur.png\", blur)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv2.imwrite(\"temp/index_thresh.png\", thresh)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "kernal = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 13))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv2.imwrite(\"temp/index_kernal.png\", kernal)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "dilate = cv2.dilate(thresh, kernal, iterations=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv2.imwrite(\"temp/index_dilate.png\", dilate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "cnts = cnts[0] if len(cnts) == 2 else cents[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "cnts = sorted(cnts, key=lambda x: cv2.boundingRect(x)[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results  = []\n",
    "for c in cnts:\n",
    "    x, y, w, h = cv2.boundingRect(c)\n",
    "    if h > 200 and w > 20:\n",
    "        roi = image[y:y+h, x:x+h]\n",
    "        cv2.rectangle(image, (x, y), (x+w, y+h), (36, 255, 12), 2)\n",
    "        ocr_result = pytesseract.image_to_string(roi)\n",
    "        ocr_result = ocr_result.split(\"\\n\")\n",
    "        for item in ocr_result:\n",
    "            results.append(item)\n",
    "cv2.imwrite(\"temp/index_bbox_new.png\", image)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[' ', '', 'A. v. E.', '', 'Abdenago 585, 3.', '', 'Abel 26, 11.', '', 'Abigail 606, 6.', '', 'Abraham 26,18. 124,18. 133, 23.', '264. 302, 13. 303, 13. 320, 40.', '496, 22.', '', 'Absalon 102, 5.', '', 'Acgfredi v. Ecgtridus.', '', 'Achab rex 504, 1.', '', 'Achaz rex 504,1.', '', 'Achiel 606, 19.', '', 'Achilon 496, 26.', '', 'Adal-, Adel-, Adhel-, Aedel-, Aed', 'Aedil-,Acdil-, Edel-, Edil-, Ath', '', ' ', '', 'Ethel-.', '', '416. 418, 14. 422, 10.', '', 'tini Turonensis 399, 3.', 'Aedilbur,', 'nia, abbatissa', '148, 32. 149, 33 [2]7 458.', '', 'Aethelfleda uxor Aethilredi', '', 'Northanhumbrorum ,', 'regis Merciorum 149, 14. 150, 4.', '', 'ventani 506, 27.', '', 'bardorum 503, 45.', 'Epistozae IV.', '', ' ', '', '   ', '   ', '  ', '  ', '    ', '   ', '   ', '  ', '  ', '    ', '    ', '   ', '', 'Aaron 38, 20. 123, 18. 168,6. 203, 38. |)', 'Abacuc propheta 36,5. 319, 29. 373, 23.', 'Abagarus rex Edessenorum 382, 13.', '', '‘|||Adalhardus,', '', '   ', '  ', '   ', '  ', ' ', '', 'Athel-, Athil-, Aethel-, Aethil-,|', '', 'Aedelbaldusabb.S. Petri Wiremuthen-', 'sis et S. Pauli Girwensis 110, 28.', '', 'Aedilberctus, Aedilberthus ep. Ha-', 'gulstadensis, antea ep. Candidae|', 'casae sive Witernensis 27,43. 72,16.', 'Adalbertus, Aedilberctus sive Magus,', 'discipulus et capellanus Arnonis)', 'archiep. Salisburg., postea_abb.', 'Ferrariensis 254, 1. 320, 41. 322, 19.)', '', '  ', '  ', '   ', '   ', '      ', '', 'Adalpertabb.[Tegernseensis ?]497,23..', 'Adalbertus presb. monasterii 8. Mar-', '', ', Edilburga, cognom. Euge-', 'ljaedanbyrgensis,', 'filia Offae regis Merciorum 78, 1.', '', 'Adalgarius ep. [Trecensis?] 510, 10,', 'Adelgisa filia Arichis ducis Bene-', '', 'Adalgisus fil. Desiderii regis Lango-|', '', 'Aethelhardus, Aethilhardus, Athel-', 'hardus, Athilhardus, Aedelhardus,', 'Aedelhardus, Aedilhardus, Aedil-|', 'hardus, Ethelardus, Edelardus', 'archiep. Cantuariensis Dorovernen-', 'sis ecclesiae metropolitanus 46, 9,', '49,9. 128,2. 145, 31, 147,5. 188. 189.', '192, 24. 193, 5 [?]. 374376, v9. 377, 4,', '412. 414,1 (2) 448. 449 [2]. 450 [2].', '451, 8 [2]. 479. 480. 614, 10. 21. 36.', '. Antonius, abb,', 'Corbeiae veteris, fil. Bernhardi', 'ae Caroli Magni 34, 25. 290.', '1. 299. 300. 364. 365. 381. 382,', '509. 566,16; frater: Bernarius;', 'fratres 381,25; soror: Gundrada,', '\\\\|Adalmondus, Aethelmodus ep. Scire-', '|| burnensis 29, 9.', 'Adelperga uxor Arichis ducis Bene-', 'ventani, filia Desiderii regis Lango-', 'bardorum 506, 5; liberi: Romual-', 'dus, Grimoaldus, Adelgisa.', '\\\\|Adalpert v, Adalbertus.', 'Aedelredus, Aedilredus, Aethelredus,', 'Aethilredus, Hedilredus rex Nor-|', 'thanhumbrorum, fil. Aedelwaldi', 'Moll regis 383,15. 35,8. 41, 19.', '49,16. 71, 20, 146,2. 147,10. 150, 6.', '151, 35. 171,17. 377,31; mater:', 'Aedilthyda; uxor: Aethelfleda;', 'famulus: Torchmundus.', '', '   ', '      ', '  ', '  ', '   ', '   ', '  ', '     ', '   ', '   ', '     ', '  ', '  ', '  ', '  ', ' ', ' ', '', ' ', ' ', '', 'capellanus Arnonis archiep. Salis-', 'burg., postea abbas Ferrariensis,', 'archiepiscopus Senonensis, 418, 14.', '422, 10.', '', 'edilthyda, Edilthruda uxor Aethel-', 'waldi Moll regis Northanhumbro-', 'rum, postea abbatissa 120,12. 151,17.', '152,31 (?]; filius: Aethelredus rex', 'Northanhumbrorum.', '', 'dildryda uxor Ecgfridi regis Nor-', 'thanhumbrorum, postea abbatissa', '(F679) 417,15.', '', 'Aedelwaldus, Aethelwoldus Moll rex', 'Northanhumbrorum 33, 16; uxor:)', 'Aedilthyda; filius: Aethelredus,', 'Adalwinus ep. Reginensis 421, 24.', '422, 21. 424, 3.', '', '    ', '  ', '    ', '', ' ', '', ' ', '', '     ', '  ', '  ', '    ', ' ', '   ', '  ', '   ', '     ', '     ', '  ', '    ', '     ', '   ', '   ', '  ', '  ', '  ', '  ', '   ', '   ', '    ', '  ', '   ', '  ', ' ', ' ', '   ', '  ', ' ', '', ' ', '   ', '    ', '  ', '    ', '  ', '', 'Aethiluuinus episcopus 28, 2.', '', 'Adam 124, 201,1. 226,7. 264, 21,', '379, 29. 466, 36-—468, v5. 591, 34.', 'Adam abb. monasterii Gemeticensis)', '', '579.', 'Adaula abbatissa Anglica 112, 4.', 'Adoredus, Hathoredus ep. Hwiccio-|', 'rum [Wigorniensis] 29, 10.', 'Adrianus v. Hadrianus.', 'Adriaticum mare 324, 41.', 'Aedel-, Aedil-, v. Adal-.', 'Aegesippus [i. e. Hegesippus] 501, 11,', 'Aegil v. Eigil.', 'Aegyptus, Egyptus 123,19. 554, 17;', '‘Aegyptii, Aegyptiaci 231. 232, 17,', '467, 36. 567,6. 571,33; Egyptiacae’', 'divitiae 470,22; Aegyptiacae tene-)', 'brae 232, 3; Acgy pit rex: Pharao;', 'Aegyptiacascola, Aegyptiaci pueri|', 'v. Carolus I imperator.', 'Aeine, Aesne, Esne ep. Herefordensis|', ', 6.', 'elberhtus archiep. Eboracensis, ma-', '‘ister Alcvini abbatis (162, 24]. 167,s.', '171,19]. [177,5]. [224,20]. [239, 28].', '332, 28]. [377,32]. (378, 31]. [429,27', 'Aelfvaldus, Aelfwaldus rex Nor-', 'thanhymbrorum, [Transhymbranae', 'gentis] 20, 27. 28, 11. 43, 8. 377, 31.', 'Aelim v. Alim.', '‘Aeneades 39, 21; v. Virgilius.', 'Aesculapius, Scolapius 397, 25.', 'Aesne v. Aeine.', 'ethel-, Aethil- v. Adal-.', 'ethiopia 478, 5.', 'fricea 124, 16; Afer: Melchiades', 'papa; Africanum concilium 526,33.', 'fricanus, sc. Scipio Afr. 575, 36.', 'gar 584, 38.', 'gareni, Aggareni 32, 19. 601, 21;', 'v, Saraceni.', 'gathensis synodus 396,', 'inus ep. Constantiensis 117, 32.', '129, 19.', '', '     ', '  ', '', '       ', '  ', '  ', ' ', '', '    ', '   ', '  ', '  ', ' ', '  ', '   ', '  ', '  ', '  ', '  ', '   ', '   ', ' ', '  ', '   ', '   ', ' ', '', '  ', '   ', '  ', '  ', '    ', '  ', '    ', '   ', '    ', '   ', '  ', '   ', '   ', '   ', '   ', '  ', '  ', '', '  ', '     ', '   ', '   ', '  ', '   ', '   ', '   ', '   ', '   ', ' ', '', '    ', ' ', '\\x0c', ' ', '', 'Aethelhardus, Aethilhardus, Athel-', 'hardus, Athilhardus, Aedelhardus,', 'Aedelhardus, Aedilhardus, Aedil-|', 'hardus, Ethelardus, Edelardus', 'archiep. Cantuariensis Dorovernen-', '', 'sis ecclesiae metropolitanus 46, 9,', '', '49,9. 128,2. 145, 31, 147,5. 188. 189.', '', '192, 24. 193, 5 [?]. 374376, v9. 377, 4,', '', '412. 414,1 (2) 448. 449 [2]. 450 [2].', '', '451, 8 [2]. 479. 480. 614, 10. 21. 36.', '', 'Adalhardus, . Antonius, abb,', 'Corbeiae veteris, fil. Bernhardi', 'ae Caroli Magni 34, 25. 290.', '', '1. 299. 300. 364. 365, 381. 382,', '509. 566,16; frater: Bernarius;', 'fratres 381,25; soror: Gundrada,', '', 'Adalmondus, Aethelmodus ep. Scire-', 'burnensis 29, 9.', '', 'Adelperga uxor Arichis ducis Bene-', 'ventani, filia Desiderii regis Lango-', 'bardorum 506, 5; liberi: Romual-', 'dus, Grimoaldus, Adelgisa.', '', 'Adalpert v, Adalbertus.', '', 'Aedelredus, Aedilredus, Aethelredus,', 'Aethilredus, Hedilredus rex Nor-|', 'thanhumbrorum, fil. Aedelwaldi', 'Moll regis 33,15. 35,8. 41, 19.', '49,16. 71, 20, 146,2. 147,10. 150, 6.', '151, 35. 171,17. 377,31; mater:', 'Aedilthyda; uxor: Aethelfleda;', 'famulus: Torchmundus.', '', 'Adhelricus diaconus, discipulus et)', 'capellanus Arnonis archiep. Salis-', 'burg., postea abbas Ferrariensis,', 'archiepiscopus Senonensis, 418, 14.', '422, 10.', '', 'Aedilthyda, Edilthruda uxor Aethel-', 'waldi Moll regis Northanhumbro-', 'rum, postea abbatissa 120,12. 151,17.', '152,31 (?]; filius: Aethelredus rex', 'Northanhumbrorum.', '', 'Edildryda uxor Ecgfridi regis Nor-', 'thanhumbrorum, postea abbatissa', '(F679) 417,15.', '', 'Aedelwaldus, Aethelwoldus Moll rex’', 'Northanhumbrorum 33, 16; uxor:)', 'Aedilthyda; filius: Aethelredus,', '', 'Adalwinus ep. Reginensis 421, 24.', '422, 21. 424, 3.', '', ' ', '', ' ', '', ' ', '', ' ', '', ' ', '', 'Aethiluuinus episcopus 28, 2.', '', 'Adam 124, 201,1. 226,7. 264, 21,', '379, 29. 466, 36-—468, v5. 591, 34.', '', 'Adam abb. monasterii Gemeticensis)', '579.', '', 'Adaula abbatissa Anglica 112, 4.', '', 'Adoredus, Hathoredus ep. Hwiccio-|', 'rum [Wigorniensis] 29, 10.', '', 'Adrianus v. Hadrianus.', '', 'Adriaticum mare 324, 41.', '', 'Aedel-, Aedil-, v. Adal-.', '', 'Aegesippus [i. e. Hegesippus] 501, 11', '', 'Aegil v. Eigil.', '', 'Aegyptus, Egyptus 123,19. 554, 17;', '‘Aegyptii, Aegyptiaci 231. 232, 17.', '467, 36. 567,6. 571,33; Egyptiacae’', 'divitiae 470, 22; Aegyptiacae tene-|', 'brae 232, 3; Aegyptt rex: Pharao;', 'Aegyptiacascola, Aegyptiaci pueri|', 'v. Carolus I imperator.', '', 'Aeine, Aesne, Esne ep. Herefordensis|', '', ', 6.', 'elberhtus archiep. Eboracensis, ma-', '‘ister Alcvini abbatis (162, 24]. 167,s.', '171,19]. [177,5]. [224,20]. [239, 28].', '332, 28]. [377,32]. (378, 31]. [429, 27].)', '', 'Aelfvaldus, Aelfwaldus rex Nor-', 'thanhymbrorum, [Transhymbranae', 'gentis] 20, 27. 28, 11. 43, 8. 377, 31.', '', 'Aelim v. Alim.', '', '‘Aeneades 39, 21; v. Virgilius.', '', 'Aesculapius, Scolapius 397, 25.', '', 'Aesne v. Aeine.', 'ethel-, Aethil- v. Adal-.', 'ethiopia 478, 5,', 'fricea 124, 16; Afer: Melchiades', 'papa; Africanum concilium 526,33.', 'fricanus, sc. Scipio Afr. 575, 36.', 'gar 584, 38.', 'gareni, Aggareni 32, 19. 601, 21;', 'v, Saraceni.', 'gathensis synodus 396,', '', 'inus ep. Constantiensis 117, 32.', '129, 19.', '', ' ', '\\x0c', 'Aethiluuinus episcopus 28, 2.', '', 'Adam 124. 201,1. 226,7. 264, 21,]', '', '379, 29. 466, 36-—468, v5. 591, 34.', '', 'Adam abb. monasterii Gemeticensis', '', '579.', '', 'Adaula abbatissa Anglica 112, 4.', '', 'Adoredus, Hathoredus ep. Hwiccio-|', 'rum [Wigorniensis] 29, 10.', '', 'Adrianus v. Hadrianus.', '', 'Adriaticum mare 324, 41.', '', 'Aedel-, Aedil-, v. Adal-.', '', 'Aegesippus [i. e. Hegesippus] 501, 11,', '', 'Aegil v. Eigil.', '', 'Aegyptus, Egyptus 123,19. 554, 17;', '‘Aegyptii, Aegyptiaci 231. 232, 17,', '467, 36. 567,6. 571,33; Egyptiacae’', 'divitiae 470, 22; Aegyptiacae tene-|', 'brae 232, 3; Aegyptt rex: Pharao;', 'Aegyptiacascola, Aegyptiaci pueri', 'v. Carolus I imperator.', '', 'Aeine, Aesne, Esne ep. Herefordensis’', '', ', 6.', 'Aelberhtus archiep. Eboracensis, ma-', '‘ister Alcvini abbatis (162, 24]. 167,s.', '171,19]. [177,5]. [224,20]. [239, 28].', '332, 28]. [377,32]. (378, 31]. [429, 27].)', '', 'Aelfvaldus, Aelfwaldus rex Nor-', 'thanhymbrorum, [Transhymbranae', 'gentis] 20, 27. 28, 11. 43, 8. 377, 31.', '', 'Aelim v. Alim.', '', 'Aeneades 39, 21; v. Virgilius.', '', 'Aesculapius, Scolapius 397, 25.', '', 'Aesne v. Aeine.', '', 'Aethel-, Aethil- v. Adal-.', '', 'Aethiopia 478, 5,', '', 'Africa 124, 16; Afer: Melchiades', 'papa; Africanum concilium 526,33.', '', 'Africanus, se. Scipio Afr. 575, 36.', '', 'Agar 584, 3s.', '', 'Agareni, Aggareni 32, 19. 601, 21;', 'v, Saraceni.', '', 'Agathensis synodus 396,', '', 'Aginus ep. Constantiensis 117, 32.', '', '129, 19.', '', ' ', '', ' ', '\\x0c']\n"
     ]
    }
   ],
   "source": [
    "print (results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "entities = []\n",
    "for item in results:\n",
    "    item = item.strip().replace(\"\\n\", \"\")\n",
    "    item = item.split(\" \")[0]\n",
    "    if len(item) > 2:\n",
    "        if item[0] == \"A\" and \"-\" not in item:\n",
    "            item = item.split(\".\")[0].replace(\",\", \"\").replace(\";\", \"\")\n",
    "            entities.append(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Abdenago', 'Abel', 'Abigail', 'Abraham', 'Absalon', 'Acgfredi', 'Achab', 'Achaz', 'Achiel', 'Achilon', 'Aedilbur', 'Aethelfleda', 'Aaron', 'Abacuc', 'Abagarus', 'Aedelbaldusabb', 'Aedilberctus', 'Adalbertus', 'Adalpertabb', 'Adalbertus', 'Adalgarius', 'Adelgisa', 'Adalgisus', 'Aethelhardus', 'Aedelhardus', 'Adelperga', 'Aedelredus', 'Aethilredus', 'Aedilthyda', 'Aedelwaldus', 'Aedilthyda', 'Adalwinus', 'Aethiluuinus', 'Adam', 'Adam', 'Adaula', 'Adoredus', 'Adrianus', 'Adriaticum', 'Aegesippus', 'Aegil', 'Aegyptus', 'Aegyptiacascola', 'Aeine', 'Aelfvaldus', 'Aelim', 'Aesculapius', 'Aesne', 'Aethelhardus', 'Aedelhardus', 'Adalhardus', 'Adalmondus', 'Adelperga', 'Adalpert', 'Aedelredus', 'Aethilredus', 'Aedilthyda', 'Adhelricus', 'Aedilthyda', 'Aedelwaldus', 'Aedilthyda', 'Adalwinus', 'Aethiluuinus', 'Adam', 'Adam', 'Adaula', 'Adoredus', 'Adrianus', 'Adriaticum', 'Aegesippus', 'Aegil', 'Aegyptus', 'Aegyptiacascola', 'Aeine', 'Aelfvaldus', 'Aelim', 'Aesculapius', 'Aesne', 'Aethiluuinus', 'Adam', 'Adam', 'Adaula', 'Adoredus', 'Adrianus', 'Adriaticum', 'Aegesippus', 'Aegil', 'Aegyptus', 'Aegyptiacascola', 'Aeine', 'Aelberhtus', 'Aelfvaldus', 'Aelim', 'Aeneades', 'Aesculapius', 'Aesne', 'Aethiopia', 'Africa', 'Africanus', 'Agar', 'Agareni', 'Agathensis', 'Aginus']\n"
     ]
    }
   ],
   "source": [
    "print (entities)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "entities = list(set(entities))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Adalpertabb', 'Adalgisus', 'Achiel', 'Aedelhardus', 'Aaron', 'Adelgisa', 'Africa', 'Abdenago', 'Abigail', 'Aethilredus', 'Acgfredi', 'Aginus', 'Adalbertus', 'Aedilthyda', 'Aelim', 'Adhelricus', 'Aegesippus', 'Abacuc', 'Achab', 'Adalpert', 'Agareni', 'Aeneades', 'Aedilbur', 'Absalon', 'Aedilberctus', 'Adrianus', 'Adalwinus', 'Aethiluuinus', 'Achilon', 'Aedelbaldusabb', 'Abel', 'Aethelhardus', 'Aedelwaldus', 'Aesculapius', 'Aegyptiacascola', 'Aegyptus', 'Adoredus', 'Aeine', 'Abraham', 'Abagarus', 'Achaz', 'Adalgarius', 'Aelfvaldus', 'Aelberhtus', 'Adalhardus', 'Adaula', 'Aegil', 'Aethiopia', 'Aesne', 'Adalmondus', 'Agar', 'Africanus', 'Aedelredus', 'Adam', 'Agathensis', 'Adelperga', 'Aethelfleda', 'Adriaticum']\n"
     ]
    }
   ],
   "source": [
    "print (entities)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "entities.sort()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Aaron', 'Abacuc', 'Abagarus', 'Abdenago', 'Abel', 'Abigail', 'Abraham', 'Absalon', 'Acgfredi', 'Achab', 'Achaz', 'Achiel', 'Achilon', 'Adalbertus', 'Adalgarius', 'Adalgisus', 'Adalhardus', 'Adalmondus', 'Adalpert', 'Adalpertabb', 'Adalwinus', 'Adam', 'Adaula', 'Adelgisa', 'Adelperga', 'Adhelricus', 'Adoredus', 'Adrianus', 'Adriaticum', 'Aedelbaldusabb', 'Aedelhardus', 'Aedelredus', 'Aedelwaldus', 'Aedilberctus', 'Aedilbur', 'Aedilthyda', 'Aegesippus', 'Aegil', 'Aegyptiacascola', 'Aegyptus', 'Aeine', 'Aelberhtus', 'Aelfvaldus', 'Aelim', 'Aeneades', 'Aesculapius', 'Aesne', 'Aethelfleda', 'Aethelhardus', 'Aethilredus', 'Aethiluuinus', 'Aethiopia', 'Africa', 'Africanus', 'Agar', 'Agareni', 'Agathensis', 'Aginus']\n"
     ]
    }
   ],
   "source": [
    "print (entities)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: 03_bounding_boxes_index_ocr.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pytesseract\n",
    "import cv2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "image = cv2.imread(\"data/index_02.JPG\")\n",
    "base_image = image.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv2.imwrite(\"temp/index_gray.png\", gray)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "blur = cv2.GaussianBlur(gray, (7,7), 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv2.imwrite(\"temp/index_blur.png\", blur)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv2.imwrite(\"temp/index_thresh.png\", thresh)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "kernal = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 13))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv2.imwrite(\"temp/index_kernal.png\", kernal)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "dilate = cv2.dilate(thresh, kernal, iterations=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv2.imwrite(\"temp/index_dilate.png\", dilate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "cnts = cnts[0] if len(cnts) == 2 else cents[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "cnts = sorted(cnts, key=lambda x: cv2.boundingRect(x)[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "for c in cnts:\n",
    "    x, y, w, h = cv2.boundingRect(c)\n",
    "    if h > 200 and w > 20:\n",
    "        roi = image[y:y+h, x:x+h]\n",
    "        cv2.imwrite(\"temp/index_roi.png\", roi)\n",
    "        cv2.rectangle(image, (x, y), (x+w, y+h), (36, 255, 12), 2)\n",
    "cv2.imwrite(\"temp/index_bbox_new.png\", image)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: 03_ocr_index-Copy1.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pytesseract\n",
    "from PIL import Image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "image_file = \"data/index_02.JPG\"\n",
    "img = Image.open(image_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "ocr_result = pytesseract.image_to_string(img)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INDEX NOMINUM.\n",
      "\n",
      "SCRIPSIT KAROLUS HAMPE.\n",
      "\n",
      "Maior numerus indicat paginam, minor lineam.\n",
      "\n",
      "A. v. E.\n",
      "\n",
      "Aaron 38, 20. 123, 18. 168, 6. 203, 38. |\n",
      "\n",
      "Abacuc propheta 36,5. 319,29. 373, 23.\n",
      "Abagarus rex Edessenorum 382, 13.\n",
      "Abdenago 585, 3.\n",
      "\n",
      "Abel 26, 11.\n",
      "\n",
      "Abigail 606, 6.\n",
      "\n",
      "Abraham 26,18. 124,18. 133, 23. 2:\n",
      "264. 302, 13. 303, 13. 320, 40. 328, 1.\n",
      "496, 22.\n",
      "\n",
      "Absalon 102, 5.\n",
      "\n",
      "Acgfredi v. Ecgfridus.\n",
      "\n",
      "Achab rex 504, 1.\n",
      "\n",
      "Achaz rex 504,1.\n",
      "\n",
      "Achiel 606, 19.\n",
      "\n",
      "Achilon 496, 26.\n",
      "\n",
      "Adal-, Adel-, Adhel-, Aedel-, Aedel-,\n",
      "‘Aedil-, Aed\n",
      "Athel-, Athil-, Aethel-, Aethil-,\n",
      "Ethel-.\n",
      "\n",
      "Aedelbaldusabb.S. Petri Wiremuthen-\n",
      "sis et S. Pauli Girwensis 110, 28.\n",
      "\n",
      "Aedilberctus, Aedilberthus ep. Ha-\n",
      "gulstadensis, antea ep. Candidae\n",
      "casae sive Witernensis 27,43. 72,16.\n",
      "\n",
      "Adalbertus, Aedilberctus sive Magus,\n",
      "discipulus et capellanus Arnonis\n",
      "archiep. Salisburg., postea_abb.\n",
      "Ferrariensis 254, 1. 320, 41. 322, 19.\n",
      "416. 418, 14. 422, 10.\n",
      "\n",
      "Adalpertabb. [Tegernseensis ?]497,23.\n",
      "\n",
      "Adalbertus presb. monasterii 8. Mar-\n",
      "tini Turonensis 399, 3.\n",
      "\n",
      "Aedilburga, Edilburga,cognom. Euge-\n",
      "nia, abbatissa Flaedanbyrgensis,\n",
      "filia Offae regis Merciorum 78, 1.\n",
      "148, 32. 149, 33 [?]7 458.\n",
      "\n",
      "Aethelfleda uxor Aethilredi regis\n",
      "Northanhumbrorum, filia Offae\n",
      "regis Merciorum 149, 14. 150, 4.\n",
      "\n",
      "Adalgarius ep. [Trecensis?] 510, 10.\n",
      "\n",
      "Adelgisa filia Arichis ducis Bene-\n",
      "ventani 506, 27.\n",
      "\n",
      "Adalgisus fil. Desiderii regis Lango-\n",
      "bardorum 503, 45.\n",
      "\n",
      "Epistotae IV.\n",
      "\n",
      " \n",
      "\n",
      " \n",
      "\n",
      "il-, Edel-, Edil-, Athal-, |\n",
      "\n",
      "Aethelhardus, Aethilhardus, Athel-\n",
      "hardus, Athilhardus, Aedelhardus,\n",
      "Aedelhardus, Aedilhardus, Aedil-\n",
      "hardus, Ethelardus, Edelardus\n",
      "archiep. Cantuariensis Dorovernen-\n",
      "sis ecclesiae metropolitanus 46, 9.\n",
      "49,9. 128,2. 145, 31, 147,5. 188. 189.\n",
      "192, 24. 193, 5[?]. 374376, v9. 377, 4.\n",
      "412. 414,1 (2. 448. 449 [2]. 450 [2].\n",
      "\n",
      "| 451,8([?]. 479. 480. 614, 10. 21. 36.\n",
      "\n",
      "| Adalhardus , cognom. Antonius, abb.\n",
      "Corbeiae veteris, fil. Bernhardi\n",
      "eo Caroli Magni 34, 25. 290.\n",
      "\n",
      "1. 299. 300. 364. 365, 381. 382.\n",
      "509. 566,16; frater: Bernarius;\n",
      "fratres 381,25; soror: Gundrada.\n",
      "\n",
      "Adalmondus, Aethelmodus ep. Scire-\n",
      "\n",
      "| burnensis 29,9.\n",
      "\n",
      "Adelperga uxor Arichis ducis Bene-\n",
      "\n",
      "| ventani, filia Desiderii regis Lango-\n",
      "\n",
      "bardorum 506, 5; liberi: Romual-\n",
      "dus, Grimoaldus, Adelgisa.\n",
      "\n",
      "| Adalpert v, Adalbertus.\n",
      "\n",
      "Aedelredus, Aedilredus, Aethelredus,\n",
      "Aethilredus, Hedilredus rex Nor-\n",
      "thanhumbrorum, fil. Aedelwaldi\n",
      "Moll regis 33,15. 35,8. 41, 19.\n",
      "49, 16. 71, 20. 146,2. 147,10. 150, 6.\n",
      "151, 35. 171,17. 377,31; mater:\n",
      "Aedilthyda; uxor: Aethelfleda;\n",
      "famulus: Torchmundus.\n",
      "\n",
      "Adhelricus diaconus, discipulus et\n",
      "capellanus Arnonis archiep. Salis-\n",
      "burg., postea abbas Ferrariensis,\n",
      "archiepiscopus Senonensis, 418, 14.\n",
      "422, 10.\n",
      "\n",
      "Aedilthyda, Edilthruda uxor Aethel-\n",
      "waldi Moll regis Northanhumbro-\n",
      "rum, postea abbatissa 120,12. 151,17.\n",
      "152,31 hs filius: Aethelredus rex\n",
      "Northanhumbrorum.\n",
      "\n",
      "Edildryda uxor Ecgfridi regis Nor-\n",
      "thanhumbrorum, postea abbatissa\n",
      "(t 679) 417,15.\n",
      "\n",
      "Aedelwaldus, Aethelwoldus Moll rex\n",
      "Northanhumbrorum 33, 16; uxor:\n",
      "Aedilthyda; filius: Aethelredus,\n",
      "\n",
      "Adalwinus ep. Reginensis 421, 24.\n",
      "422, 21. 424,3.\n",
      "\n",
      " \n",
      "\n",
      " \n",
      "\n",
      "Aethiluuinus episcopus 28, 2.\n",
      "\n",
      "Adam 124, 201,1. 226, 7. 264, at.\n",
      "379, 29. 466, 36—468, v5. 591, 34.\n",
      "Adam abb. monasterii Gemeticensis\n",
      "\n",
      "579.\n",
      "\n",
      "Adaula abbatissa Anglica 112, 4.\n",
      "\n",
      "Adoredus, Hathoredus ep. Hwiccio-\n",
      "rum [Wigorniensis] 29, 10.\n",
      "\n",
      "Adrianus v. Hadrianus.\n",
      "\n",
      "Adriaticum mare 324, 41.\n",
      "\n",
      "| Aedel-, Aedil-, v. Adal-.\n",
      "\n",
      "Aegesippus [#. e. Hegesippus] 501, 11.\n",
      "\n",
      "Aegil v. Eigil.\n",
      "\n",
      "Aegyptus, Egyptus 123,19. 554, 17;\n",
      "Aegyptii, Aegyptiaci 231. 232, 17.\n",
      "467, 36. 567,6. 571,33; Egyptiacae\n",
      "divitiae 470, 22; Aegyptiacae tene-\n",
      "brae 232, 3; Acgy pit rex: Pharao;\n",
      "Aegyptiacascola, Aegyptiaci pueri\n",
      "v. Carolus I imperator.\n",
      "\n",
      "Aeine, Aesne, Esne ep. Herefordensis\n",
      "29, 6.\n",
      "\n",
      "Aelberhtus archiep. Eboracensis, ma-\n",
      "\n",
      "‘ister Alevini abbatis (162, 24]. 167,s.\n",
      "171,19]. [177,5]. [224,20]. [239, 28].\n",
      "332, 28]. [377,32]. (378, 31]. [429,27\n",
      "\n",
      "Aelfvaldus, Aelfwaldus rex Nor-\n",
      "thanhymbrorum, [Transhymbranae\n",
      "gentis] 20, 27. 28, 11. 43,8. 377, 31.\n",
      "\n",
      "Aelim v. Alim.\n",
      "\n",
      "Aeneades 39, 21; v. Virgilius.\n",
      "\n",
      "Aesculapius, Scolapius 397, 25.\n",
      "\n",
      "Aesne v. Aeine,\n",
      "\n",
      "Aethel-, Aethil- v. Adal-.\n",
      "\n",
      "Aethiopia 478, 5.\n",
      "\n",
      "Africa 124, 16; Afer: Melchiades\n",
      "papa; Africanum concilium 526,33.\n",
      "\n",
      "Africanus, sc. Scipio Afr. 575, 36.\n",
      "\n",
      "Agar 584, 3s.\n",
      "\n",
      "Agareni, Aggareni 32, 19. 601, 21;\n",
      "v, Saraceni.\n",
      "\n",
      "Agathensis synodus 396,\n",
      "\n",
      "Aginus ep. Constantiensis 117, 32.\n",
      "\n",
      "129, 19.\n",
      "\n",
      "8\n",
      "\f",
      "\n"
     ]
    }
   ],
   "source": [
    "print (ocr_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: 03_ocr_index.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from PIL import Image\n",
    "import pytesseract\n",
    "\n",
    "image_file = \"data/index_02.jpg\"\n",
    "img = Image.open(image_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "ocr_result = pytesseract.image_to_string(img)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INDEX NOMINUM.\n",
      "\n",
      "SCRIPSIT KAROLUS HAMPE.\n",
      "\n",
      "Maior numerus indicat paginam, minor lineam.\n",
      "\n",
      "A. v. E.\n",
      "\n",
      "Aaron 38, 20. 123, 18. 168, 6. 203, 38. |\n",
      "\n",
      "Abacuc propheta 36,5. 319,29. 373, 23.\n",
      "Abagarus rex Edessenorum 382, 13.\n",
      "Abdenago 585, 3.\n",
      "\n",
      "Abel 26, 11.\n",
      "\n",
      "Abigail 606, 6.\n",
      "\n",
      "Abraham 26,18. 124,18. 133, 23. 2:\n",
      "264. 302, 13. 303, 13. 320, 40. 328, 1.\n",
      "496, 22.\n",
      "\n",
      "Absalon 102, 5.\n",
      "\n",
      "Acgfredi v. Ecgfridus.\n",
      "\n",
      "Achab rex 504, 1.\n",
      "\n",
      "Achaz rex 504,1.\n",
      "\n",
      "Achiel 606, 19.\n",
      "\n",
      "Achilon 496, 26.\n",
      "\n",
      "Adal-, Adel-, Adhel-, Aedel-, Aedel-,\n",
      "‘Aedil-, Aed\n",
      "Athel-, Athil-, Aethel-, Aethil-,\n",
      "Ethel-.\n",
      "\n",
      "Aedelbaldusabb.S. Petri Wiremuthen-\n",
      "sis et S. Pauli Girwensis 110, 28.\n",
      "\n",
      "Aedilberctus, Aedilberthus ep. Ha-\n",
      "gulstadensis, antea ep. Candidae\n",
      "casae sive Witernensis 27,43. 72,16.\n",
      "\n",
      "Adalbertus, Aedilberctus sive Magus,\n",
      "discipulus et capellanus Arnonis\n",
      "archiep. Salisburg., postea_abb.\n",
      "Ferrariensis 254, 1. 320, 41. 322, 19.\n",
      "416. 418, 14. 422, 10.\n",
      "\n",
      "Adalpertabb. [Tegernseensis ?]497,23.\n",
      "\n",
      "Adalbertus presb. monasterii 8. Mar-\n",
      "tini Turonensis 399, 3.\n",
      "\n",
      "Aedilburga, Edilburga,cognom. Euge-\n",
      "nia, abbatissa Flaedanbyrgensis,\n",
      "filia Offae regis Merciorum 78, 1.\n",
      "148, 32. 149, 33 [?]7 458.\n",
      "\n",
      "Aethelfleda uxor Aethilredi regis\n",
      "Northanhumbrorum, filia Offae\n",
      "regis Merciorum 149, 14. 150, 4.\n",
      "\n",
      "Adalgarius ep. [Trecensis?] 510, 10.\n",
      "\n",
      "Adelgisa filia Arichis ducis Bene-\n",
      "ventani 506, 27.\n",
      "\n",
      "Adalgisus fil. Desiderii regis Lango-\n",
      "bardorum 503, 45.\n",
      "\n",
      "Epistotae IV.\n",
      "\n",
      " \n",
      "\n",
      " \n",
      "\n",
      "il-, Edel-, Edil-, Athal-, |\n",
      "\n",
      "Aethelhardus, Aethilhardus, Athel-\n",
      "hardus, Athilhardus, Aedelhardus,\n",
      "Aedelhardus, Aedilhardus, Aedil-\n",
      "hardus, Ethelardus, Edelardus\n",
      "archiep. Cantuariensis Dorovernen-\n",
      "sis ecclesiae metropolitanus 46, 9.\n",
      "49,9. 128,2. 145, 31, 147,5. 188. 189.\n",
      "192, 24. 193, 5[?]. 374376, v9. 377, 4.\n",
      "412. 414,1 (2. 448. 449 [2]. 450 [2].\n",
      "\n",
      "| 451,8([?]. 479. 480. 614, 10. 21. 36.\n",
      "\n",
      "| Adalhardus , cognom. Antonius, abb.\n",
      "Corbeiae veteris, fil. Bernhardi\n",
      "eo Caroli Magni 34, 25. 290.\n",
      "\n",
      "1. 299. 300. 364. 365, 381. 382.\n",
      "509. 566,16; frater: Bernarius;\n",
      "fratres 381,25; soror: Gundrada.\n",
      "\n",
      "Adalmondus, Aethelmodus ep. Scire-\n",
      "\n",
      "| burnensis 29,9.\n",
      "\n",
      "Adelperga uxor Arichis ducis Bene-\n",
      "\n",
      "| ventani, filia Desiderii regis Lango-\n",
      "\n",
      "bardorum 506, 5; liberi: Romual-\n",
      "dus, Grimoaldus, Adelgisa.\n",
      "\n",
      "| Adalpert v, Adalbertus.\n",
      "\n",
      "Aedelredus, Aedilredus, Aethelredus,\n",
      "Aethilredus, Hedilredus rex Nor-\n",
      "thanhumbrorum, fil. Aedelwaldi\n",
      "Moll regis 33,15. 35,8. 41, 19.\n",
      "49, 16. 71, 20. 146,2. 147,10. 150, 6.\n",
      "151, 35. 171,17. 377,31; mater:\n",
      "Aedilthyda; uxor: Aethelfleda;\n",
      "famulus: Torchmundus.\n",
      "\n",
      "Adhelricus diaconus, discipulus et\n",
      "capellanus Arnonis archiep. Salis-\n",
      "burg., postea abbas Ferrariensis,\n",
      "archiepiscopus Senonensis, 418, 14.\n",
      "422, 10.\n",
      "\n",
      "Aedilthyda, Edilthruda uxor Aethel-\n",
      "waldi Moll regis Northanhumbro-\n",
      "rum, postea abbatissa 120,12. 151,17.\n",
      "152,31 hs filius: Aethelredus rex\n",
      "Northanhumbrorum.\n",
      "\n",
      "Edildryda uxor Ecgfridi regis Nor-\n",
      "thanhumbrorum, postea abbatissa\n",
      "(t 679) 417,15.\n",
      "\n",
      "Aedelwaldus, Aethelwoldus Moll rex\n",
      "Northanhumbrorum 33, 16; uxor:\n",
      "Aedilthyda; filius: Aethelredus,\n",
      "\n",
      "Adalwinus ep. Reginensis 421, 24.\n",
      "422, 21. 424,3.\n",
      "\n",
      " \n",
      "\n",
      " \n",
      "\n",
      "Aethiluuinus episcopus 28, 2.\n",
      "\n",
      "Adam 124, 201,1. 226, 7. 264, at.\n",
      "379, 29. 466, 36—468, v5. 591, 34.\n",
      "Adam abb. monasterii Gemeticensis\n",
      "\n",
      "579.\n",
      "\n",
      "Adaula abbatissa Anglica 112, 4.\n",
      "\n",
      "Adoredus, Hathoredus ep. Hwiccio-\n",
      "rum [Wigorniensis] 29, 10.\n",
      "\n",
      "Adrianus v. Hadrianus.\n",
      "\n",
      "Adriaticum mare 324, 41.\n",
      "\n",
      "| Aedel-, Aedil-, v. Adal-.\n",
      "\n",
      "Aegesippus [#. e. Hegesippus] 501, 11.\n",
      "\n",
      "Aegil v. Eigil.\n",
      "\n",
      "Aegyptus, Egyptus 123,19. 554, 17;\n",
      "Aegyptii, Aegyptiaci 231. 232, 17.\n",
      "467, 36. 567,6. 571,33; Egyptiacae\n",
      "divitiae 470, 22; Aegyptiacae tene-\n",
      "brae 232, 3; Acgy pit rex: Pharao;\n",
      "Aegyptiacascola, Aegyptiaci pueri\n",
      "v. Carolus I imperator.\n",
      "\n",
      "Aeine, Aesne, Esne ep. Herefordensis\n",
      "29, 6.\n",
      "\n",
      "Aelberhtus archiep. Eboracensis, ma-\n",
      "\n",
      "‘ister Alevini abbatis (162, 24]. 167,s.\n",
      "171,19]. [177,5]. [224,20]. [239, 28].\n",
      "332, 28]. [377,32]. (378, 31]. [429,27\n",
      "\n",
      "Aelfvaldus, Aelfwaldus rex Nor-\n",
      "thanhymbrorum, [Transhymbranae\n",
      "gentis] 20, 27. 28, 11. 43,8. 377, 31.\n",
      "\n",
      "Aelim v. Alim.\n",
      "\n",
      "Aeneades 39, 21; v. Virgilius.\n",
      "\n",
      "Aesculapius, Scolapius 397, 25.\n",
      "\n",
      "Aesne v. Aeine,\n",
      "\n",
      "Aethel-, Aethil- v. Adal-.\n",
      "\n",
      "Aethiopia 478, 5.\n",
      "\n",
      "Africa 124, 16; Afer: Melchiades\n",
      "papa; Africanum concilium 526,33.\n",
      "\n",
      "Africanus, sc. Scipio Afr. 575, 36.\n",
      "\n",
      "Agar 584, 3s.\n",
      "\n",
      "Agareni, Aggareni 32, 19. 601, 21;\n",
      "v, Saraceni.\n",
      "\n",
      "Agathensis synodus 396,\n",
      "\n",
      "Aginus ep. Constantiensis 117, 32.\n",
      "\n",
      "129, 19.\n",
      "\n",
      "8\n",
      "\f",
      "\n"
     ]
    }
   ],
   "source": [
    "print (ocr_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "lines = ocr_result.split(\"\\n\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Abdel-Malek', 'A.']\n",
      "['Abernethy', 'D. B.']\n",
      "['Achinger', 'H.']\n",
      "['Adler', 'M.']\n",
      "['The affluent worker in the\\nclass structure', '']\n",
      "['Agarwala', 'S.N.']\n",
      "[]\n",
      "['Agnelli', 'A.']\n",
      "['Agosti', 'A.']\n",
      "['Albrecht', 'W.']\n",
      "['Alexander', 'R. J.']\n",
      "['Der algerische Sozialismus']\n",
      "['Almasy', 'E.']\n",
      "['Alston', 'P. L.']\n",
      "['Altholz', 'J. L.']\n",
      "['Ammassari', 'G.']\n",
      "['Anders', 'K.']\n",
      "['Anderson', 'E. N.']\n",
      "['Andréani', 'E.']\n",
      "['Andréas', 'B.']\n",
      "['Andreasi', 'A.']\n",
      "['Andrews', 'W.']\n",
      "['Angell', 'A.']\n",
      "['Anikeev', 'V. V.']\n",
      "['Anklage und Botschaft']\n",
      "[]\n",
      "['Ansart', 'P.']\n",
      "['Anweiler', 'O.']\n",
      "['Archiv fiir Sozialgeschichte', 'IX']\n",
      "['Archives de Jules Humbert-\\nDroz', 'I']\n",
      "[]\n",
      "['Arian', 'A.']\n",
      "['Arima', 'T.']\n",
      "['Aron', 'Raymond']\n",
      "['Aronson', 'G.']\n",
      "['Ashraf', 'A.']\n",
      "['Ashtor', 'E.']\n",
      "['INDEX OF NAMES*']\n",
      "['Asociacién Internacional de\\nlos Trabajadores. Actas de\\nlos Consejos y Comisién\\nFederal de la Regién Es-\\npafiola']\n",
      "['Atkin', 'R.']\n",
      "['Atkinson', 'A. B.']\n",
      "['Austromarxismus']\n",
      "['Auty', 'Ph.']\n",
      "['Avakumovié', 'I.']\n",
      "['Avrich', 'P.']\n",
      "['Azmon', 'Y.']\n",
      "['Ba Maw', 'G.', 'S.', 'S.', 'H. C.', 'P.', 'M.', 'vide Bakunin', 'M.\\nBakunin', 'M.', 'R. J.', 'I.', 'J. A.', 'A.', 'R. H.', 'H.', 'W.', 'N. V.', 'H.', 'R.', 'O.', 'J.', 'R.', 'A.', 'J.', 'F.', 'W.', 'H. J.', 'A. W.', 'B.', 'G.']\n",
      "['Beradt', 'Ch.', 'P. L.', 'D.', 'M.', 'L.', 'E.', 'E.', 'I.', 'S.', 'J.', 'Ph.', 'L.', 'N.', 'K.', 'C. J.', 'A.', 'A.', 'H. D.', 'E.', 'A. A.', 'H. M.', 'J.', 'V.', 'A.', 'G.-E.', 'M.', 'F.', 'A.', 'G.', 'E.', 'D. M.', 'H.', 'A.', 'Rh.', 'R.', 'G. M.', 'C.', 'S.', 'A.', 'L.', 'V.', 'M. J.', 'M. J.']\n",
      "['']\n",
      "['* References to publications listed under the heading of Other Books are printed in italics.']\n"
     ]
    }
   ],
   "source": [
    "for line in lines:\n",
    "    temp_line = line.replace(\",\", \"\")\n",
    "    if temp_line.isdigit():\n",
    "        pass\n",
    "    else:\n",
    "        components = []\n",
    "        segs = line.split(\",\")\n",
    "        for seg in segs:\n",
    "            seg = seg.strip()\n",
    "            num = False\n",
    "            for character in seg:\n",
    "                if character.isdigit():\n",
    "                    num = True\n",
    "            if num == False:\n",
    "                components.append(seg)\n",
    "        print (components)\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: 04_01_get_whole_text.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pytesseract\n",
    "import cv2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "image = cv2.imread(\"data/sample_mgh.JPG\")\n",
    "base_image = image.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n",
    "blur = cv2.GaussianBlur(gray, (7,7), 0)\n",
    "thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "kernal = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 50))\n",
    "dilate = cv2.dilate(thresh, kernal, iterations=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv2.imwrite(\"temp/sample_dilated.png\", dilate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
    "cnts = cnts[0] if len(cnts) == 2 else cnts[1]\n",
    "cnts = sorted(cnts, key=lambda x: cv2.boundingRect(x)[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "for c in cnts:\n",
    "    x,y,w,h = cv2.boundingRect(c)\n",
    "    if h > 200 and w > 250:\n",
    "        roi = base_image[y:y+h, x:x+w]\n",
    "        cv2.rectangle(image, (x,y), (x+w, y+h), (36, 255, 12), 2)\n",
    "\n",
    "cv2.imwrite(\"temp/sample_boxes.png\", image)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "ocr_result_original = pytesseract.image_to_string(base_image)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "18 ALCVINI EPISTOLAE. 1.\n",
      "\n",
      "1,\n",
      "\n",
      "773-786 Alevinus amicum in monasterium se abditurum confirmat. De Benedicto monacho.\n",
      "773—786.\n",
      "Cod. H fol. 61'—62'.\n",
      "Edd. F211, J. 8, — Migne col. 485.\n",
      "\n",
      "DILECTISSIMO* AMICO TOTIUS PROSPERITATIS PRAESENTIS ET AETERNAE\n",
      "BEATITUDINIS PERPETUAM SALUTEM.\n",
      "Magna mihi laetitia est de bona voluntate vestra, quam audivi a fratre nostro\n",
      "Benedicto! in vobis esse. Opto atque Deum deprecor, ut citius cum omni convenien-\n",
      "Pont tia perficiatur. Scriptum est enim: ‘Ne tardes converti ad dominum Deum; quia\n",
      "nescis, quid ventura pariat dies’, Erue te de harum carcere tribulationum, quae in\n",
      "Ps. 33, 20.hoe mundo fidelium animos torquere solent®; sicut scriptum est: ‘Multae tribulationes\n",
      "iustorum’; ut, quod sequitur, tibi evenire merearis: ‘Sed de his omnibus liberavit eos\n",
      "cf. Luc.9,62. Dominus’. Et cave diligentissime, ne qua te, aratrum Domini tenentem, iniustitia\n",
      "retro revocet. Nemo miles sarcinis alienis onustus ad bella bene procedit, nisi armis\n",
      "tantummodo victricibus, vel ad defensionem sui vel ad laesionem adversarii.\n",
      "\n",
      "Omnia quae vobis demandare necessaria videbantur mihi fidelissimo fratri Bene-\n",
      "dicto dixi: loca, adiutorium et animi constantiam.\n",
      "\n",
      "Sed scire debes, quod in omni loco, ubi hominum conversatio est plurimorum,\n",
      "utrumque et boni et mali inveniuntur. Sed sapiens animus utrorumque utatur magi-\n",
      "sterio; id est, ut malorum caveat malitiam, et bonorum sequatur iustitiam. Mens\n",
      "regalis®, quae homini data est, discernere debet, quae sint cavenda et quae sint se-\n",
      "quenda; nec multum de loco diffidere vel etiam confidere. Quia, si locus adiuvare\n",
      "potuisset, numquam angelus de caelo cecidisset vel homo in paradyso positus pec-\n",
      "\n",
      "Lue. 17,21. casset. Sed regnum Dei, ut ipsa veritas ait, intra nosmetipsos quaerendum est. Et\n",
      "\n",
      "Ps, $3, 10. psalmista: ‘Timete Dominum omnes sancti eius, quia nihil deest timentibus eum’.\n",
      "Timor Domini peccare vetat, dum homo ubique Dei sibi praesentiam agnoscit et timet.\n",
      "Quem® qui conscium habet cogitationum verborum vel operum suorum, hunc habi-\n",
      "turus est et iudicem. Nec eum quicquam effugit nostri nec aliquid iniudicatum de-\n",
      "\n",
      "Anjgi221% miserit; quia, sicut dictum est, unicuique reddet secundum opera sua, Dum tempus\n",
      "habemus, operemur bonum, quia post mortem non est tempus operandi, sed tempus\n",
      "mercedem recipiendi. Haec cogitans, carissime fili, tui ipsius curam habeto, memor,\n",
      "de quantis te liberavit Deus periculis. Illum ama et ad eius misericordiam conver-\n",
      "tere, ut deleantur delicta tua et merearis locum refrigerii lucis et pacis recipere cum\n",
      "sanctis Dei. Meique memor cum Dei servientibus pro teque intercedentibus valeas\n",
      "perpetua prosperitate, dulcissime amice.\n",
      "\n",
      " \n",
      "\n",
      "1. 4) dilectissimis >) solet Z °) Quia H\n",
      "\n",
      "1) Hune Benedictum fratrem eundem esse existimo, per quem Alevinus Leutfredum episcopum de\n",
      "se ipso certiorem facere voluit, ideoque hanc epistolam eidem tempori, cui et sequentem, assignare velim,\n",
      "praesertim cum in codice alteri antecedat. 2) 4, €, mens, quae hominem regit. J,\n",
      "\n",
      "15\n",
      "\n",
      "3\n",
      "\f",
      "\n"
     ]
    }
   ],
   "source": [
    "print(ocr_result_original)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ocr_result_new = pytesseract.image_to_string(roi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "por"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: 04_03_capture_side_notes.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "786\n",
      "Ez, 13, 5.\n",
      "“£117,\n",
      "\n",
      "Ez. 8, 18. 19.\n",
      "\n",
      "Toh. 10, 11.\n",
      "\n",
      "Tudae 12.\n",
      "Matth. 25, 21.\n",
      "\n",
      "1, Petr. 5,3.\n",
      "\n",
      "\"fay.\n",
      "\n",
      "ef, Luc, 12,\n",
      "35.\n",
      "\n",
      "“col. 578,\n",
      "\n",
      "Matth.23, 27,\n",
      "Matth. 6, 1.\n",
      "\f",
      "\n"
     ]
    }
   ],
   "source": [
    "import pytesseract\n",
    "import cv2\n",
    "\n",
    "\n",
    "image = cv2.imread('data/sample_mgh_2.jpg')\n",
    "base_image = image.copy()\n",
    "\n",
    "gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n",
    "blur = cv2.GaussianBlur(gray, (7,7), 0)\n",
    "thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]\n",
    "\n",
    "# Create rectangular structuring element and dilate\n",
    "kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,25))\n",
    "dilate = cv2.dilate(thresh, kernel, iterations=1)\n",
    "\n",
    "# Find contours and draw rectangle\n",
    "cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
    "cnts = cnts[0] if len(cnts) == 2 else cnts[1]\n",
    "cnts = sorted(cnts, key=lambda x: cv2.boundingRect(x)[1])\n",
    "main_text = \"\"\n",
    "for c in cnts:\n",
    "    x,y,w,h = cv2.boundingRect(c)\n",
    "    if h > 200 and w > 250:\n",
    "        roi = base_image[y:y+h, 0:x]\n",
    "#         cv2.rectangle(image, (0, y), (x, 0 + h+20), (36,255,12), 2)\n",
    "        \n",
    "        constant= cv2.copyMakeBorder(roi.copy(),30,30,30,30,cv2.BORDER_CONSTANT,value=[255,255,255])\n",
    "        ocr_result = pytesseract.image_to_string(constant)\n",
    "        cv2.imwrite(\"temp/output.png\", roi)\n",
    "        \n",
    "        print (ocr_result)\n",
    "#         print (ocr_result)\n",
    "# cv2.imwrite(\"temp/output.png\", image)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'img' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-2-6f996add95e2>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mocr_result\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpytesseract\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mimage_to_string\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mimg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m: name 'img' is not defined"
     ]
    }
   ],
   "source": [
    "ocr_result = pytesseract.image_to_string(img)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print (ocr_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lines = ocr_result.split(\"\\n\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for line in lines:\n",
    "    temp_line = line.replace(\",\", \"\")\n",
    "    if temp_line.isdigit():\n",
    "        pass\n",
    "    else:\n",
    "        components = []\n",
    "        segs = line.split(\",\")\n",
    "        for seg in segs:\n",
    "            seg = seg.strip()\n",
    "            num = False\n",
    "            for character in seg:\n",
    "                if character.isdigit():\n",
    "                    num = True\n",
    "            if num == False:\n",
    "                components.append(seg)\n",
    "        print (components)\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: 04_04_separate_footnotes.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pytesseract\n",
    "import cv2\n",
    "\n",
    "\n",
    "image = cv2.imread('data/sample_mgh.jpg')\n",
    "im_h, im_w, im_d = image.shape\n",
    "base_image = image.copy()\n",
    "\n",
    "gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n",
    "blur = cv2.GaussianBlur(gray, (7,7), 0)\n",
    "thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]\n",
    "\n",
    "# Create rectangular structuring element and dilate\n",
    "kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50,10))\n",
    "dilate = cv2.dilate(thresh, kernel, iterations=1)\n",
    "\n",
    "# Find contours and draw rectangle\n",
    "cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
    "cnts = cnts[0] if len(cnts) == 2 else cnts[1]\n",
    "cnts = sorted(cnts, key=lambda x: cv2.boundingRect(x)[1])\n",
    "for c in cnts:\n",
    "    x,y,w,h = cv2.boundingRect(c)\n",
    "    if h < 20 and w > 250:\n",
    "        roi = base_image[0:y+h, 0:x+im_w]\n",
    "        cv2.rectangle(image, (x, y), (x + w, y + h), (36,255,12), 2)\n",
    "        \n",
    "cv2.imwrite(\"temp/output.png\", roi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: README.md
================================================
# <center>Introduction to OCR with Python</center>
### <center>by Dr. W.J.B. Mattingly</center>

## Introduction
Optical Character Recognition, or OCR, is a common task in many domains. The earliest OCR systems were designed to serve the vision impaired. Its modern application, however, has extended to a far wider population. The goal of OCR is to take an input image and output raw text while maintaining the structure of the text in the image. In othere words, its end-goal is to preserve the line breaks, paragraph segmentation, and other features of the structure of the text on the page.

This course is designed to teach you how to automate OCR in Python for optimized results. It is meant to function alongside this YouTube Series [OCR in Python Tutorials](https://www.youtube.com/playlist?list=PL2VXyKi-KpYuTAZz__9KVl1jQz74bDG7i)

## Organization of Textbook

|Lesson|Name|
|------|----------|
|01.01   |Introduction to OCR|
|01.02   |Introduction to the Libraries   |
|01.03   |How to Install Libraries   |
|02.01   |The Basics of Pillow|
|02.02   |The Basics of OpenCV   |
|02.03   |The Basics of Tesseract   |
|03.01   |Passing Pillow Images to OpenCV   |
|03.02   |The Basics of OpenCV   |
|03.03   |Manipulating the Image   |
|04.01   |Bounding Boxes   |
|04.02   |Extracting Bounding Boxes |
|04.03   |Organizing Bounding Boxes |
|05.01   |Parameters of Tesseract   |
|05.02   |Cleaning the Output of Tesseract  |
|06.01   |Workflow for Standard OCR of Text|
|06.02   |Workflow for Ignoring Footnotes   |
|06.03   |Workflow for Tables   |
|07.01   |Tesseract with non-English   |
|07.02   |Tesseract with Early Modern Scripts   |
|07.03   |Tesseract with non-Latin Scripts   |
Download .txt
gitextract_l23f_itf/

├── 01_02_libraries.py
├── 02_01_working_with_images.py
├── 02_02_working with opencv.ipynb
├── 02_02_working_with_opencv.py
├── 02_03_intro_to_pytes.ipynb
├── 03_bounding_boxes_index-Copy1.ipynb
├── 03_bounding_boxes_index_ocr.ipynb
├── 03_ocr_index-Copy1.ipynb
├── 03_ocr_index.ipynb
├── 04_01_get_whole_text.ipynb
├── 04_03_capture_side_notes.ipynb
├── 04_04_separate_footnotes.ipynb
└── README.md
Download .txt
SYMBOL INDEX (1 symbols across 1 files)

FILE: 02_02_working_with_opencv.py
  function remove_borders (line 6) | def remove_borders(img):
Condensed preview — 13 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (74K chars).
[
  {
    "path": "01_02_libraries.py",
    "chars": 55,
    "preview": "import cv2\r\nfrom PIL import Image\r\nimport pytesseract\r\n"
  },
  {
    "path": "02_01_working_with_images.py",
    "chars": 112,
    "preview": "from PIL import Image\r\n\r\nim_file = \"data/page_01.jpg\"\r\n\r\nim = Image.open(im_file)\r\nim.save(\"temp/page_01.jpg\")\r\n"
  },
  {
    "path": "02_02_working_with_opencv.py",
    "chars": 2226,
    "preview": "#https://nanonets.com/blog/ocr-with-tesseract/\r\n#COVERED IN THIS VIDEO:\r\n#\r\n\r\n\r\ndef remove_borders(img):\r\n    contours,h"
  },
  {
    "path": "02_03_intro_to_pytes.ipynb",
    "chars": 4456,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "03_bounding_boxes_index-Copy1.ipynb",
    "chars": 19790,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "03_bounding_boxes_index_ocr.ipynb",
    "chars": 4642,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "03_ocr_index-Copy1.ipynb",
    "chars": 8352,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "03_ocr_index.ipynb",
    "chars": 11314,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "04_01_get_whole_text.ipynb",
    "chars": 6922,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "04_03_capture_side_notes.ipynb",
    "chars": 4785,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\":"
  },
  {
    "path": "04_04_separate_footnotes.ipynb",
    "chars": 2563,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 31,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\""
  },
  {
    "path": "README.md",
    "chars": 1693,
    "preview": "# <center>Introduction to OCR with Python</center>\n### <center>by Dr. W.J.B. Mattingly</center>\n\n## Introduction\nOptical"
  }
]

// ... and 1 more files (download for full content)

About this extraction

This page contains the full source code of the wjbmattingly/ocr_python_textbook GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 13 files (12.7 MB), approximately 25.0k tokens, and a symbol index with 1 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!