@@ -40,8 +40,14 @@ class ZipImportError(ImportError):
4040_module_type = type (sys )
4141
4242END_CENTRAL_DIR_SIZE = 22
43- STRING_END_ARCHIVE = b'PK\x05 \x06 '
43+ END_CENTRAL_DIR_SIZE_64 = 56
44+ END_CENTRAL_DIR_LOCATOR_SIZE_64 = 20
45+ STRING_END_ARCHIVE = b'PK\x05 \x06 ' # standard EOCD signature
46+ STRING_END_LOCATOR_64 = b'PK\x06 \x07 ' # Zip64 EOCD Locator signature
47+ STRING_END_ZIP_64 = b'PK\x06 \x06 ' # Zip64 EOCD signature
4448MAX_COMMENT_LEN = (1 << 16 ) - 1
49+ MAX_UINT32 = 0xffffffff
50+ ZIP64_EXTRA_TAG = 0x1
4551
4652class zipimporter (_bootstrap_external ._LoaderBasics ):
4753 """zipimporter(archivepath) -> zipimporter object
@@ -352,49 +358,72 @@ def _read_directory(archive):
352358 # to not cause problems when some runs 'python3 /dev/fd/9 9<some_script'
353359 start_offset = fp .tell ()
354360 try :
361+ # Check if there's a comment.
355362 try :
356- fp .seek (- END_CENTRAL_DIR_SIZE , 2 )
357- header_position = fp .tell ()
358- buffer = fp .read (END_CENTRAL_DIR_SIZE )
363+ fp .seek (0 , 2 )
364+ file_size = fp .tell ()
359365 except OSError :
360- raise ZipImportError (f"can't read Zip file: { archive !r} " , path = archive )
361- if len (buffer ) != END_CENTRAL_DIR_SIZE :
362- raise ZipImportError (f"can't read Zip file: { archive !r} " , path = archive )
363- if buffer [:4 ] != STRING_END_ARCHIVE :
364- # Bad: End of Central Dir signature
365- # Check if there's a comment.
366- try :
367- fp .seek (0 , 2 )
368- file_size = fp .tell ()
369- except OSError :
370- raise ZipImportError (f"can't read Zip file: { archive !r} " ,
371- path = archive )
372- max_comment_start = max (file_size - MAX_COMMENT_LEN -
373- END_CENTRAL_DIR_SIZE , 0 )
374- try :
375- fp .seek (max_comment_start )
376- data = fp .read ()
377- except OSError :
378- raise ZipImportError (f"can't read Zip file: { archive !r} " ,
379- path = archive )
380- pos = data .rfind (STRING_END_ARCHIVE )
381- if pos < 0 :
382- raise ZipImportError (f'not a Zip file: { archive !r} ' ,
383- path = archive )
366+ raise ZipImportError (f"can't read Zip file: { archive !r} " ,
367+ path = archive )
368+ max_comment_plus_dirs_size = (
369+ MAX_COMMENT_LEN + END_CENTRAL_DIR_SIZE +
370+ END_CENTRAL_DIR_SIZE_64 + END_CENTRAL_DIR_LOCATOR_SIZE_64 )
371+ max_comment_start = max (file_size - max_comment_plus_dirs_size , 0 )
372+ try :
373+ fp .seek (max_comment_start )
374+ data = fp .read (max_comment_plus_dirs_size )
375+ except OSError :
376+ raise ZipImportError (f"can't read Zip file: { archive !r} " ,
377+ path = archive )
378+ pos = data .rfind (STRING_END_ARCHIVE )
379+ pos64 = data .rfind (STRING_END_ZIP_64 )
380+
381+ if (pos64 >= 0 and pos64 + END_CENTRAL_DIR_SIZE_64 + END_CENTRAL_DIR_LOCATOR_SIZE_64 == pos ):
382+ # Zip64 at "correct" offset from standard EOCD
383+ buffer = data [pos64 :pos64 + END_CENTRAL_DIR_SIZE_64 ]
384+ if len (buffer ) != END_CENTRAL_DIR_SIZE_64 :
385+ raise ZipImportError (
386+ f"corrupt Zip64 file: Expected { END_CENTRAL_DIR_SIZE_64 } byte "
387+ f"zip64 central directory, but read { len (buffer )} bytes." ,
388+ path = archive )
389+ header_position = file_size - len (data ) + pos64
390+
391+ central_directory_size = int .from_bytes (buffer [40 :48 ], 'little' )
392+ central_directory_position = int .from_bytes (buffer [48 :56 ], 'little' )
393+ num_entries = int .from_bytes (buffer [24 :32 ], 'little' )
394+ elif pos >= 0 :
384395 buffer = data [pos :pos + END_CENTRAL_DIR_SIZE ]
385396 if len (buffer ) != END_CENTRAL_DIR_SIZE :
386397 raise ZipImportError (f"corrupt Zip file: { archive !r} " ,
387398 path = archive )
399+
388400 header_position = file_size - len (data ) + pos
389401
390- header_size = _unpack_uint32 (buffer [12 :16 ])
391- header_offset = _unpack_uint32 (buffer [16 :20 ])
392- if header_position < header_size :
402+ # Buffer now contains a valid EOCD, and header_position gives the
403+ # starting position of it.
404+ central_directory_size = _unpack_uint32 (buffer [12 :16 ])
405+ central_directory_position = _unpack_uint32 (buffer [16 :20 ])
406+ num_entries = _unpack_uint16 (buffer [8 :10 ])
407+
408+ # N.b. if someday you want to prefer the standard (non-zip64) EOCD,
409+ # you need to adjust position by 76 for arc to be 0.
410+ else :
411+ raise ZipImportError (f'not a Zip file: { archive !r} ' ,
412+ path = archive )
413+
414+ # Buffer now contains a valid EOCD, and header_position gives the
415+ # starting position of it.
416+ # XXX: These are cursory checks but are not as exact or strict as they
417+ # could be. Checking the arc-adjusted value is probably good too.
418+ if header_position < central_directory_size :
393419 raise ZipImportError (f'bad central directory size: { archive !r} ' , path = archive )
394- if header_position < header_offset :
420+ if header_position < central_directory_position :
395421 raise ZipImportError (f'bad central directory offset: { archive !r} ' , path = archive )
396- header_position -= header_size
397- arc_offset = header_position - header_offset
422+ header_position -= central_directory_size
423+ # On just-a-zipfile these values are the same and arc_offset is zero; if
424+ # the file has some bytes prepended, `arc_offset` is the number of such
425+ # bytes. This is used for pex as well as self-extracting .exe.
426+ arc_offset = header_position - central_directory_position
398427 if arc_offset < 0 :
399428 raise ZipImportError (f'bad central directory size or offset: { archive !r} ' , path = archive )
400429
@@ -411,6 +440,11 @@ def _read_directory(archive):
411440 raise EOFError ('EOF read where not expected' )
412441 # Start of file header
413442 if buffer [:4 ] != b'PK\x01 \x02 ' :
443+ if count != num_entries :
444+ raise ZipImportError (
445+ f"mismatched num_entries: { count } should be { num_entries } in { archive !r} " ,
446+ path = archive ,
447+ )
414448 break # Bad: Central Dir File Header
415449 if len (buffer ) != 46 :
416450 raise EOFError ('EOF read where not expected' )
@@ -426,9 +460,6 @@ def _read_directory(archive):
426460 comment_size = _unpack_uint16 (buffer [32 :34 ])
427461 file_offset = _unpack_uint32 (buffer [42 :46 ])
428462 header_size = name_size + extra_size + comment_size
429- if file_offset > header_offset :
430- raise ZipImportError (f'bad local header offset: { archive !r} ' , path = archive )
431- file_offset += arc_offset
432463
433464 try :
434465 name = fp .read (name_size )
@@ -440,7 +471,10 @@ def _read_directory(archive):
440471 # slower than reading the data because fseek flushes stdio's
441472 # internal buffers. See issue #8745.
442473 try :
443- if len (fp .read (header_size - name_size )) != header_size - name_size :
474+ extra_data_len = header_size - name_size
475+ extra_data = memoryview (fp .read (extra_data_len ))
476+
477+ if len (extra_data ) != extra_data_len :
444478 raise ZipImportError (f"can't read Zip file: { archive !r} " , path = archive )
445479 except OSError :
446480 raise ZipImportError (f"can't read Zip file: { archive !r} " , path = archive )
@@ -457,6 +491,60 @@ def _read_directory(archive):
457491
458492 name = name .replace ('/' , path_sep )
459493 path = _bootstrap_external ._path_join (archive , name )
494+
495+ # Ordering matches unpacking below.
496+ if (
497+ file_size == MAX_UINT32 or
498+ data_size == MAX_UINT32 or
499+ file_offset == MAX_UINT32
500+ ):
501+ # need to decode extra_data looking for a zip64 extra (which might not
502+ # be present)
503+ while extra_data :
504+ if len (extra_data ) < 4 :
505+ raise ZipImportError (f"can't read header extra: { archive !r} " , path = archive )
506+ tag = _unpack_uint16 (extra_data [:2 ])
507+ size = _unpack_uint16 (extra_data [2 :4 ])
508+ if len (extra_data ) < 4 + size :
509+ raise ZipImportError (f"can't read header extra: { archive !r} " , path = archive )
510+ if tag == ZIP64_EXTRA_TAG :
511+ if (len (extra_data ) - 4 ) % 8 != 0 :
512+ raise ZipImportError (f"can't read header extra: { archive !r} " , path = archive )
513+ num_extra_values = (len (extra_data ) - 4 ) // 8
514+ if num_extra_values > 3 :
515+ raise ZipImportError (f"can't read header extra: { archive !r} " , path = archive )
516+ values = struct .unpack_from (f"<{ min (num_extra_values , 3 )} Q" ,
517+ extra_data , offset = 4 )
518+
519+ # N.b. Here be dragons: the ordering of these is different than
520+ # the header fields, and it's really easy to get it wrong since
521+ # naturally-occurring zips that use all 3 are >4GB
522+ if file_size == MAX_UINT32 :
523+ file_size = values .pop (0 )
524+ if data_size == MAX_UINT32 :
525+ data_size = values .pop (0 )
526+ if file_offset == MAX_UINT32 :
527+ file_offset = values .pop (0 )
528+
529+ break
530+
531+ # For a typical zip, this bytes-slicing only happens 2-3 times, on
532+ # small data like timestamps and filesizes.
533+ extra_data = extra_data [4 + size :]
534+ else :
535+ _bootstrap ._verbose_message (
536+ "zipimport: suspected zip64 but no zip64 extra for {!r}" ,
537+ path ,
538+ )
539+ # XXX These two statements seem swapped because `central_directory_position`
540+ # is a position within the actual file, but `file_offset` (when compared) is
541+ # as encoded in the entry, not adjusted for this file.
542+ # N.b. this must be after we've potentially read the zip64 extra which can
543+ # change `file_offset`.
544+ if file_offset > central_directory_position :
545+ raise ZipImportError (f'bad local header offset: { archive !r} ' , path = archive )
546+ file_offset += arc_offset
547+
460548 t = (path , compress , data_size , file_size , file_offset , time , date , crc )
461549 files [name ] = t
462550 count += 1
0 commit comments