Use Google Drive OCR

authenticate

https://developers.google.com/drive/v3/web/quickstart/python#step_1_turn_on_the_api_name

Run following code

from __future__ import print_function
import httplib2
import os

from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage

try:
    import argparse
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
except ImportError:
    flags = None

# If modifying these scopes, delete your previously saved credentials
# at ~/.credentials/drive-python-quickstart.json
SCOPES = 'https://www.googleapis.com/auth/drive'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'OCR'


def get_credentials():
    """Gets valid user credentials from storage.

    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Returns:
        Credentials, the obtained credential.
    """
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir,
                                   'client_secret.json')

    store = Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        print(os.getcwd())
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else: # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials

def main():
    """Shows basic usage of the Google Drive API.

    Creates a Google Drive API service object and outputs the names and IDs
    for up to 10 files.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    results = service.files().list(
        pageSize=10,fields="nextPageToken, files(id, name)").execute()
    items = results.get('files', [])
    if not items:
        print('No files found.')
    else:
        print('Files:')
        for item in items:
            print('{0} ({1})'.format(item['name'], item['id']))

if __name__ == '__main__':
    main()

use

from __future__ import print_function
import httplib2
import os

from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
import time

from apiclient import errors
from apiclient.http import MediaFileUpload
from apiclient import discovery
import oauth2client


try:
    import argparse
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
except ImportError:
    flags = None

# If modifying these scopes, delete your previously saved credentials
# at ~/.credentials/drive-python-quickstart.json
SCOPES = 'https://www.googleapis.com/auth/drive'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'OCR'


def get_credentials():
    """Gets valid user credentials from storage.

    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Returns:
        Credentials, the obtained credential.
    """
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir,
                                   'client_secret.json')

    store = Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        print(os.getcwd())
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else: # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials


def insert_file(service, title, description, parent_id, mime_type, filename):
  """Insert new file.

  Args:
    service: Drive API service instance.
    title: Title of the file to insert, including the extension.
    description: Description of the file to insert.
    parent_id: Parent folder's ID.
    mime_type: MIME type of the file to insert.
    filename: Filename of the file to insert.
  Returns:
    Inserted file metadata if successful, None otherwise.
  """
  media_body = MediaFileUpload(filename, mimetype=mime_type, resumable=True)
  body = {
    'title': title,
    'description': description,
    'mimeType': mime_type
  }
  # Set the parent folder.
  if parent_id:
    body['parents'] = [{'id': parent_id}]

  try:
    file = service.files().insert(
        body=body,
        media_body=media_body,
        ocr=True).execute()

    # Uncomment the following line to print the File ID
    # print 'File ID: %s' % file['id']

    return file
  except errors.HttpError as error:
    print ('An error occured: %s' % error)
    return None


def download_file(service, drive_file):
  """Download a file's content.

  Args:
    service: Drive API service instance.
    drive_file: Drive File instance.

  Returns:
    File's content if successful, None otherwise.
  """
  download_url = drive_file['exportLinks']['text/plain']
  if download_url:
    resp, content = service._http.request(download_url)
    if resp.status == 200:
      # print ('Status: %s' % resp)
      return content
    else:
      print ('An error occurred: %s' % resp)
      return None
  else:
    # The file doesn't have any content stored on Drive.
    return None

def googleOCR(folderName,pdfList):
    """Shows basic usage of the Google Drive API.

    Creates a Google Drive API service object and outputs the names and IDs
    for up to 10 files.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v2', http=http)

    # create folder for year in
    file_metadata = {
        'title': folderName,
        'mimeType': 'application/vnd.google-apps.folder'
    }
    rootFolder = service.files().insert(body=file_metadata,
                                        fields='id').execute()


    for pdf in pdfList:
        print(pdf)
        fileTitle = pdf.replace('.pdf','')
        print("OCR file")
        file = insert_file(service,fileTitle,'description',rootFolder.get('id'), 'application/pdf',pdf)
        print("done inserting file")
        time.sleep(3)
        print("downloading file")
        text = download_file(service,file)
        # text = str(text)
        print("done downloading file")
        with open(fileTitle + '.txt','wb') as outputFile:
            outputFile.write(text.strip())

AI 2

Algorithm 17

Amazon 1

Authorization 1

Blog 3

Bootstrap 1

C++ 1

CCpp 5

CSS 2

Cloud 3

Code 1

Crawler 1

DNS 1

Database 17

DeepLearning 1

Design 17

Development 1

Docker 1

English 1

Express 1

GDB 1

Go 3

Google 4

HTML 3

IOS 1

Java 17

Javascript 4

Jekyll 1

Linux 4

MacOS 2

MachineLearning 18

Markdown 4

Mobile 1

MongoDB 2

Multi-threading 3

NAS 1

Network 11

NeuralNetwork 10

Node 1

OS 8

Public-speaking 1

Python 15

RESTful 1

Rails 9

React 1

Redis 1

Ruby 6

Shell 2

Spring 2

System 17

TCP 1

TDD 1

Thread 2

Vim 1

awk 1

git 1

jQuery 1

media 1

network 1

php 1

Use Google Drive OCR

authenticate

use