Public speaking course notes Read "Dynamo, Amazon’s Highly Available Key-value Store" Read "Bigtable, A Distributed Storage System for Structured Data" Read "Streaming Systems" 3, Watermarks Read "Streaming Systems" 1&2, Streaming 101 Read "F1, a distributed SQL database that scales" Read "Zanzibar, Google’s Consistent, Global Authorization System" Read "Spanner, Google's Globally-Distributed Database" Read "Designing Data-intensive applications" 12, The Future of Data Systems IOS development with Swift Read "Designing Data-intensive applications" 10&11, Batch and Stream Processing Read "Designing Data-intensive applications" 9, Consistency and Consensus Read "Designing Data-intensive applications" 8, Distributed System Troubles Read "Designing Data-intensive applications" 7, Transactions Read "Designing Data-intensive applications" 6, Partitioning Read "Designing Data-intensive applications" 5, Replication Read "Designing Data-intensive applications" 3&4, Storage, Retrieval, Encoding Read "Designing Data-intensive applications" 1&2, Foundation of Data Systems Three cases of binary search TAMU Operating System 2 Memory Management TAMU Operating System 1 Introduction Overview in cloud computing 2 TAMU Operating System 7 Virtualization TAMU Operating System 6 File System TAMU Operating System 5 I/O and Disk Management TAMU Operating System 4 Synchronization TAMU Operating System 3 Concurrency and Threading TAMU Computer Networks 5 Data Link Layer TAMU Computer Networks 4 Network Layer TAMU Computer Networks 3 Transport Layer TAMU Computer Networks 2 Application Layer TAMU Computer Networks 1 Introduction Overview in distributed systems and cloud computing 1 A well-optimized Union-Find implementation, in Java A heap implementation supporting deletion TAMU Advanced Algorithms 3, Maximum Bandwidth Path (Dijkstra, MST, Linear) TAMU Advanced Algorithms 2, B+ tree and Segment Intersection TAMU Advanced Algorithms 1, BST, 2-3 Tree and Heap TAMU AI, Searching problems Factorization Machine and Field-aware Factorization Machine for CTR prediction TAMU Neural Network 10 Information-Theoretic Models TAMU Neural Network 9 Principal Component Analysis TAMU Neural Network 8 Neurodynamics TAMU Neural Network 7 Self-Organizing Maps TAMU Neural Network 6 Deep Learning Overview TAMU Neural Network 5 Radial-Basis Function Networks TAMU Neural Network 4 Multi-Layer Perceptrons TAMU Neural Network 3 Single-Layer Perceptrons Princeton Algorithms P1W6 Hash Tables & Symbol Table Applications Stanford ML 11 Application Example Photo OCR Stanford ML 10 Large Scale Machine Learning Stanford ML 9 Anomaly Detection and Recommender Systems Stanford ML 8 Clustering & Principal Component Analysis Princeton Algorithms P1W5 Balanced Search Trees TAMU Neural Network 2 Learning Processes TAMU Neural Network 1 Introduction Stanford ML 7 Support Vector Machine Stanford ML 6 Evaluate Algorithms Princeton Algorithms P1W4 Priority Queues and Symbol Tables Stanford ML 5 Neural Networks Learning Princeton Algorithms P1W3 Mergesort and Quicksort Stanford ML 4 Neural Networks Basics Princeton Algorithms P1W2 Stack and Queue, Basic Sorts Stanford ML 3 Classification Problems Stanford ML 2 Multivariate Regression and Normal Equation Princeton Algorithms P1W1 Union and Find Stanford ML 1 Introduction and Parameter Learning

Use Google Drive OCR

2017-03-23

authenticate

https://developers.google.com/drive/v3/web/quickstart/python#step_1_turn_on_the_api_name

Run following code

from __future__ import print_function
import httplib2
import os

from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage

try:
    import argparse
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
except ImportError:
    flags = None

# If modifying these scopes, delete your previously saved credentials
# at ~/.credentials/drive-python-quickstart.json
SCOPES = 'https://www.googleapis.com/auth/drive'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'OCR'


def get_credentials():
    """Gets valid user credentials from storage.

    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Returns:
        Credentials, the obtained credential.
    """
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir,
                                   'client_secret.json')

    store = Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        print(os.getcwd())
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else: # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials

def main():
    """Shows basic usage of the Google Drive API.

    Creates a Google Drive API service object and outputs the names and IDs
    for up to 10 files.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    results = service.files().list(
        pageSize=10,fields="nextPageToken, files(id, name)").execute()
    items = results.get('files', [])
    if not items:
        print('No files found.')
    else:
        print('Files:')
        for item in items:
            print('{0} ({1})'.format(item['name'], item['id']))

if __name__ == '__main__':
    main()

use

from __future__ import print_function
import httplib2
import os

from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
import time

from apiclient import errors
from apiclient.http import MediaFileUpload
from apiclient import discovery
import oauth2client


try:
    import argparse
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
except ImportError:
    flags = None

# If modifying these scopes, delete your previously saved credentials
# at ~/.credentials/drive-python-quickstart.json
SCOPES = 'https://www.googleapis.com/auth/drive'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'OCR'


def get_credentials():
    """Gets valid user credentials from storage.

    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Returns:
        Credentials, the obtained credential.
    """
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir,
                                   'client_secret.json')

    store = Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        print(os.getcwd())
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else: # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials


def insert_file(service, title, description, parent_id, mime_type, filename):
  """Insert new file.

  Args:
    service: Drive API service instance.
    title: Title of the file to insert, including the extension.
    description: Description of the file to insert.
    parent_id: Parent folder's ID.
    mime_type: MIME type of the file to insert.
    filename: Filename of the file to insert.
  Returns:
    Inserted file metadata if successful, None otherwise.
  """
  media_body = MediaFileUpload(filename, mimetype=mime_type, resumable=True)
  body = {
    'title': title,
    'description': description,
    'mimeType': mime_type
  }
  # Set the parent folder.
  if parent_id:
    body['parents'] = [{'id': parent_id}]

  try:
    file = service.files().insert(
        body=body,
        media_body=media_body,
        ocr=True).execute()

    # Uncomment the following line to print the File ID
    # print 'File ID: %s' % file['id']

    return file
  except errors.HttpError as error:
    print ('An error occured: %s' % error)
    return None


def download_file(service, drive_file):
  """Download a file's content.

  Args:
    service: Drive API service instance.
    drive_file: Drive File instance.

  Returns:
    File's content if successful, None otherwise.
  """
  download_url = drive_file['exportLinks']['text/plain']
  if download_url:
    resp, content = service._http.request(download_url)
    if resp.status == 200:
      # print ('Status: %s' % resp)
      return content
    else:
      print ('An error occurred: %s' % resp)
      return None
  else:
    # The file doesn't have any content stored on Drive.
    return None

def googleOCR(folderName,pdfList):
    """Shows basic usage of the Google Drive API.

    Creates a Google Drive API service object and outputs the names and IDs
    for up to 10 files.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v2', http=http)

    # create folder for year in
    file_metadata = {
        'title': folderName,
        'mimeType': 'application/vnd.google-apps.folder'
    }
    rootFolder = service.files().insert(body=file_metadata,
                                        fields='id').execute()


    for pdf in pdfList:
        print(pdf)
        fileTitle = pdf.replace('.pdf','')
        print("OCR file")
        file = insert_file(service,fileTitle,'description',rootFolder.get('id'), 'application/pdf',pdf)
        print("done inserting file")
        time.sleep(3)
        print("downloading file")
        text = download_file(service,file)
        # text = str(text)
        print("done downloading file")
        with open(fileTitle + '.txt','wb') as outputFile:
            outputFile.write(text.strip())

Creative Commons License
Melon blog is created by melonskin. This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.
© 2016-2024. All rights reserved by melonskin. Powered by Jekyll.