pdfminer · pietermarsman · Oct 13, 2021 · Feb 26, 2021 · Mar 15, 2021 · May 28, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,11 @@ All notable changes in pdfminer.six will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
+## [Unreleased]
+
+### Added
+- Support for identity cmaps ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
+
 ## [20211012]
 
 ### Added

diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py
@@ -160,6 +160,13 @@ def dump(self, out: TextIO = sys.stdout) -> None:
         return
 
 
+class IdentityUnicodeMap(UnicodeMap):
+    def get_unichr(self, cid: int) -> str:
+        """Return the character whose Unicode code point equals cid."""
+        log.debug('get_unichr: %r, %r', self, cid)
+        return chr(cid)  # chr() raises ValueError if cid is outside 0..0x10FFFF
+
+
 class FileCMap(CMap):
 
     def add_code2cid(self, code: str, cid: int) -> None:

diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py
@@ -7,6 +7,7 @@
 
 from . import settings
 from .cmapdb import CMap
+from .cmapdb import IdentityUnicodeMap
 from .cmapdb import CMapBase
 from .cmapdb import CMapDB
 from .cmapdb import CMapParser
@@ -763,9 +764,17 @@ def __init__(
                                BytesIO(self.fontfile.get_data()))
         self.unicode_map: Optional[UnicodeMap] = None
         if 'ToUnicode' in spec:
-            strm = stream_value(spec['ToUnicode'])
-            self.unicode_map = FileUnicodeMap()
-            CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
+            if isinstance(spec['ToUnicode'], PDFStream):
+                strm = stream_value(spec['ToUnicode'])
+                self.unicode_map = FileUnicodeMap()
+                CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
+            else:
+                cmap_name = literal_name(spec['ToUnicode'])
+                encoding = literal_name(spec['Encoding'])
+                if 'Identity' in cid_ordering \
+                        or 'Identity' in cmap_name \
+                        or 'Identity' in encoding:
+                    self.unicode_map = IdentityUnicodeMap()
         elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'):
             if ttf:
                 try:

diff --git a/samples/contrib/issue-625-identity-cmap.pdf b/samples/contrib/issue-625-identity-cmap.pdf
diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py
@@ -33,6 +33,7 @@ def run_with_file(sample_path):
     "simple4.pdf": "Text1\nText2\nText3\n\n\f",
     "contrib/issue_566_test_1.pdf": "ISSUE Date：2019-4-25 Buyer：黎荣",
     "contrib/issue_566_test_2.pdf": "甲方：中国饮料有限公司（盖章）",
+    "contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
 }
 
 
@@ -92,6 +93,12 @@ def test_issue_566_cid_range(self):
         s = run_with_file(test_file)
         self.assertEqual(s.strip(), test_strings[test_file])
 
+    def test_issue_625_identity_cmap(self):
+        test_file = "contrib/issue-625-identity-cmap.pdf"
+        lines = run_with_file(test_file).splitlines()
+        # Only the seventh extracted line is compared with the expected text.
+        self.assertEqual(lines[6], test_strings[test_file])
+
 
 class TestExtractPages(unittest.TestCase):
     def _get_test_file_path(self):